diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1000,7 +1000,7 @@ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), LVT.getSizeInBits() + HVT.getSizeInBits()); - EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout(), false); + EVT ShiftAmtVT = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo); Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi); Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi, diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -602,17 +602,15 @@ ; SI-LABEL: v_bswap_v2i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v2 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v2 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v2i16: @@ -635,21 +633,20 @@ ; SI-LABEL: v_bswap_v3i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v4, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v3, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: s_mov_b32 s4, 0xff00ff +; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8 ; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v3 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v4 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v3 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 ; SI-NEXT: v_bfi_b32 v2, s4, v2, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v3i16: @@ -674,27 +671,25 @@ ; SI-LABEL: v_bswap_v4i16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8 -; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 +; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 ; SI-NEXT: s_mov_b32 s4, 0xff00ff -; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8 -; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 -; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8 +; SI-NEXT: v_alignbit_b32 v5, v3, v3, 8 ; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24 -; SI-NEXT: v_bfi_b32 v1, s4, v1, v4 -; SI-NEXT: v_bfi_b32 v0, s4, v0, v5 -; SI-NEXT: v_bfi_b32 v3, s4, v3, v6 -; SI-NEXT: v_bfi_b32 v2, s4, v2, v7 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_alignbit_b32 v6, v0, v0, 8 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 +; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 +; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 +; SI-NEXT: v_bfi_b32 v2, s4, v2, v4 +; SI-NEXT: v_bfi_b32 v3, s4, v3, v5 +; SI-NEXT: v_bfi_b32 v0, s4, v0, v6 +; SI-NEXT: v_bfi_b32 v1, s4, v1, v7 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_bswap_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -71,10 +71,7 @@ ; R600-NOT: MOV ; GFX6: s_mov_b32 s3, 0xf000 ; GFX6: s_waitcnt lgkmcnt(0) -; GFX6: s_lshr_b32 s2, s2, 16 -; GFX6: s_or_b32 s4, s2, 0x50000 -; GFX6: s_mov_b32 s2, -1 -; GFX6: v_mov_b32_e32 v0, s4 +; GFX6: v_alignbit_b32 v0, 5, s4, 16 ; GFX6: buffer_store_dword v0, off, s[0:3], 0 ; GFX8: s_mov_b32 s3, 0xf000 ; GFX8: s_mov_b32 s2, -1 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1392,27 +1392,29 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[12:13], s[4:5] ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2 +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s8, s2 ; SI-NEXT: s_mov_b32 s9, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v2 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 +; SI-NEXT: v_or_b32_e32 v5, v5, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v4, v6 +; SI-NEXT: v_or_b32_e32 v6, v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 -; SI-NEXT: v_alignbit_b32 v5, v3, v5, 24 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24 +; SI-NEXT: v_or_b32_e32 v4, v4, v6 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0 ; SI-NEXT: s_endpgm @@ -1421,7 +1423,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_mov_b32 s8, 0xc0c0001 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 @@ -1431,35 +1433,32 @@ ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ubyte v6, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[0:1] +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v3, v[2:3] ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: s_mov_b32 s3, s7 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 -; VI-NEXT: v_or_b32_e32 v4, v5, v4 -; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v5, v5, v3 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 +; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v5 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_perm_b32 v4, v4, v5, s8 +; VI-NEXT: v_perm_b32 v5, 0, v5, s8 +; VI-NEXT: v_or_b32_sdwa v4, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v5, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1468,24 +1467,25 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v7, 0 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 ; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 8, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshl_or_b32 v6, v2, 8, v4 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 +; GFX10-NEXT: v_lshlrev_b16 v4, 8, v0 +; GFX10-NEXT: v_perm_b32 v5, 0, v0, 0xc0c0001 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 -; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v7, v4, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v6, v4, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: @@ -1495,19 +1495,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 ; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 ; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 -; GFX9-NEXT: s_mov_b32 s4, 0x4000405 +; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v7, v2, 8, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 +; GFX9-NEXT: v_perm_b32 v6, 0, v6, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dword v5, v4, s[2:3] ; GFX9-NEXT: s_endpgm @@ -1518,21 +1519,23 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x2 ; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 ; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3 -; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3 ; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshl_or_b32 v2, v3, 8, v1 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b16 v4, 8, v2 +; GFX11-NEXT: v_perm_b32 v5, 0, v2, 0xc0c0001 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshl_or_b32 v5, v2, 8, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 +; GFX11-NEXT: v_or_b32_e32 v4, v0, v4 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] ; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] @@ -1572,23 +1575,23 @@ ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v7 -; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v5 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5 ; SI-NEXT: v_or_b32_e32 v0, v6, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_and_b32_e32 v4, 0xff000000, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1600,7 +1603,6 @@ ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v5, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1613,19 +1615,19 @@ ; VI-NEXT: s_mov_b32 s2, s6 ; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 -; VI-NEXT: v_add_u16_e32 v8, 9, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4 +; VI-NEXT: v_add_u16_e32 v4, 9, v4 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_nop 0 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 -; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5 +; VI-NEXT: v_add_u16_e32 v2, 9, v5 +; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v2, 0x900 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1637,18 +1639,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 ; GFX10-NEXT: v_add_nc_u16 v4, v0, 9 -; GFX10-NEXT: v_add_nc_u16 v2, v2, 9 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1 +; GFX10-NEXT: v_add_nc_u16 v1, v1, 9 +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 @@ -1669,26 +1670,25 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffffff00, v4 -; GFX9-NEXT: v_add_u16_e32 v9, 9, v4 -; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 9, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6 +; GFX9-NEXT: v_add_u16_e32 v2, 9, v6 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 ; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1705,29 +1705,27 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_nc_u16 v1, v1, 9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u16 v3, v1, 9 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v2 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900 ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -450,14 +450,13 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-NEXT: s_and_b32 s3, s3, 0xffff0000 -; GCN-NEXT: s_or_b32 s2, s2, s3 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; @@ -506,9 +505,8 @@ ; GCN-LABEL: divergent_vec_i16_HH: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: divergent_vec_i16_HH: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -182,12 +182,12 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: fneg_xor_select_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -238,12 +238,12 @@ } ; FUNC-LABEL: {{^}}v_fneg_v2i16: -; SI: v_lshlrev_b32_e32 v1, 16, v1 +; SI: v_xor_b32_e32 v1, 0x8000, v1 ; SI: v_xor_b32_e32 v0, 0x8000, v0 -; SI: v_xor_b32_e32 v1, 0x80000000, v1 +; SI: v_lshlrev_b32_e32 v2, 16, v1 ; SI: v_and_b32_e32 v0, 0xffff, v0 -; SI: v_or_b32_e32 v0, v0, v1 -; SI: v_lshrrev_b32_e32 v1, 16, v1 +; SI: v_or_b32_e32 v0, v0, v2 +; SI: v_and_b32_e32 v1, 0xffff, v1 ; VI: s_waitcnt ; VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1851,23 +1851,27 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_bfe_u32 v4, v2, 16, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX7-NEXT: v_alignbit_b32 v2, v4, v2, 16 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_alignbit_b32 v3, s10, v3, 16 +; GFX7-NEXT: v_alignbit_b32 v0, v7, v0, 16 +; GFX7-NEXT: v_alignbit_b32 v3, 0, v3, 16 ; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v7, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v8, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2015,40 +2015,40 @@ ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) ; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v6, v2, 4, 4 +; GFX7-NEXT: v_bfe_i32 v7, v2, 4, 4 ; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 -; GFX7-NEXT: v_bfe_i32 v13, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_bfe_i32 v14, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 ; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 ; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 ; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 -; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2444,36 +2444,36 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v8, 15, v2 -; GFX7-NEXT: v_bfe_u32 v7, v2, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v9, 15, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v15, 15, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 12, v2 -; GFX7-NEXT: v_bfe_u32 v14, v0, 4, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 12, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 15, v0 +; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 +; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 +; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 +; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 +; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 +; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 +; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 12, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xf000000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xf000000, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v13, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v9, 0xf000000, v9 -; GFX7-NEXT: v_and_b32_e32 v16, 0xf000000, v16 +; GFX7-NEXT: v_alignbit_b32 v2, s10, v2, 24 +; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 24 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 -; GFX7-NEXT: v_alignbit_b32 v9, s10, v9, 24 -; GFX7-NEXT: v_alignbit_b32 v8, 0, v16, 24 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 -; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v8, v1 -; GFX7-NEXT: v_bfe_u32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v12, v1 -; GFX7-NEXT: v_bfe_u32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX7-NEXT: v_mad_u32_u24 v1, v4, v11, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 28, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v10, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -237,12 +237,11 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v2, s4 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 -; CI-NEXT: s_or_b32 s0, s1, s0 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_lshr_b32 s0, s2, 16 +; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -311,13 +310,13 @@ ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; CI-NEXT: s_or_b32 s1, s0, s1 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART ; CI-NEXT: ; use s0 @@ -774,11 +773,10 @@ ; CI-NEXT: flat_load_dword v3, v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: s_lshr_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; CI-NEXT: v_alignbit_b32 v2, v2, s4, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -287,38 +287,34 @@ ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v4, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16: @@ -1208,38 +1204,34 @@ ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v4, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v4, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v5, v3, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16: @@ -5866,62 +5858,58 @@ ; GFX67-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v4, v3, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v4, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v5, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v4, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v3, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v5, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: @@ -6193,62 +6181,58 @@ ; GFX67-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v4, v3, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v5, v2, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v4, v2, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v4, v3, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v5, v2, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v5, v3, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v4, v2, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v5, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v4, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v5, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v0, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v4, 1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v1, v5, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v2, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v5, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v4, v1 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v3, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v5, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v2, v5, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v0, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v5, vcc, s4, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v2, v4, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v3, v5, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 0x10000, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v4, v3 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 -; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v6, v0, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v4, v2, v4, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v1, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v5, v3, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v1, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v2, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v4 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v2, v4, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v2, v1 ; GFX67-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8068,20 +8068,19 @@ ; GFX6-NOHSA: ; %bb.0: ; GFX6-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s2, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s5 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s2, s4 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff +; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -8094,15 +8093,14 @@ ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 8 -; GFX7-HSA-NEXT: s_bfe_u32 s3, s2, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1 +; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16 ; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 +; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8312,28 +8310,26 @@ ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s6, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s8, s5, 0x80010 +; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s9, s9, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s8, s7 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s6 -; GFX6-NOHSA-NEXT: s_or_b32 s6, s10, s9 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s2 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -8342,29 +8338,27 @@ ; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_and_b32 s1, s3, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 8 -; GFX7-HSA-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff0000 -; GFX7-HSA-NEXT: s_or_b32 s1, s3, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 8 -; GFX7-HSA-NEXT: s_or_b32 s4, s5, s4 -; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24 +; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00 +; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8 -; GFX7-HSA-NEXT: s_or_b32 s3, s5, s3 -; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0 +; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4 +; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 +; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8689,47 +8683,42 @@ ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s9, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s10, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s11, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s13, s5, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s15, s4, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s7, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s17, s7, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 ; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s19, s6, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6 ; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s11, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s12, s13, s12 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s11 -; GFX6-NOHSA-NEXT: s_or_b32 s11, s15, s14 -; GFX6-NOHSA-NEXT: s_or_b32 s13, s17, s16 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s9 -; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s18 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10 ; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -8739,52 +8728,48 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_and_b32 s9, s5, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s5, 8 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24 +; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24 +; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24 +; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00 +; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00 +; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s10, s10, 0xff0000 -; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9 -; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 8 -; GFX7-HSA-NEXT: s_and_b32 s3, s7, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s10, s11, s10 -; GFX7-HSA-NEXT: s_and_b32 s9, s9, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s4, 0x80010 +; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8 ; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-HSA-NEXT: s_or_b32 s9, s11, s9 -; GFX7-HSA-NEXT: s_or_b32 s4, s4, s8 -; GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 8 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s7, 0x80010 +; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 ; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s3, s3, 8 -; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff0000 -; GFX7-HSA-NEXT: s_or_b32 s3, s7, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8 -; GFX7-HSA-NEXT: s_or_b32 s8, s11, s8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff ; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8 -; GFX7-HSA-NEXT: s_or_b32 s7, s11, s7 -; GFX7-HSA-NEXT: s_or_b32 s2, s6, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12 +; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10 +; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8 +; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -9361,91 +9346,80 @@ ; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s13, s7, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s14, s4, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s15, s5, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s16, s2, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s17, s3, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s18, s0, 0xff00 -; GFX6-NOHSA-NEXT: s_and_b32 s19, s1, 0xff00 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s1, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s21, s1, 0x80010 +; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24 +; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00 +; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s23, s0, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s3, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s25, s3, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 ; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s27, s2, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2 ; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s29, s5, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5 ; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s31, s4, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s33, s7, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s34, s7, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7 ; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff -; GFX6-NOHSA-NEXT: s_lshr_b32 s35, s6, 8 -; GFX6-NOHSA-NEXT: s_bfe_u32 s36, s6, 0x80010 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6 ; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NOHSA-NEXT: s_and_b32 s20, s20, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s19, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s22, s22, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16 +; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s24, s24, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s26, s26, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s28, s28, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s15, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s30, s30, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s33, s33, 0xff0000 -; GFX6-NOHSA-NEXT: s_lshl_b32 s13, s13, 8 -; GFX6-NOHSA-NEXT: s_and_b32 s35, s35, 0xff0000 +; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16 ; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8 -; GFX6-NOHSA-NEXT: s_or_b32 s20, s21, s20 -; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s19 -; GFX6-NOHSA-NEXT: s_or_b32 s19, s23, s22 -; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s18 -; GFX6-NOHSA-NEXT: s_or_b32 s18, s25, s24 -; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s17 -; GFX6-NOHSA-NEXT: s_or_b32 s17, s27, s26 -; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s16 -; GFX6-NOHSA-NEXT: s_or_b32 s16, s29, s28 -; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s15 -; GFX6-NOHSA-NEXT: s_or_b32 s15, s31, s30 -; GFX6-NOHSA-NEXT: s_or_b32 s21, s34, s33 -; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s13 -; GFX6-NOHSA-NEXT: s_or_b32 s13, s36, s35 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1 +; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2 +; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8 +; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4 +; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9 +; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16 +; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6 +; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14 ; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12 -; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s20 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NOHSA-NEXT: s_endpgm ; @@ -9455,102 +9429,94 @@ ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_and_b32 s17, s1, 0xff00 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s1, 8 -; GFX7-HSA-NEXT: s_bfe_u32 s19, s1, 0x80010 +; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24 +; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24 +; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24 +; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00 +; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00 ; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s17, s17, 8 -; GFX7-HSA-NEXT: s_and_b32 s18, s18, 0xff0000 -; GFX7-HSA-NEXT: s_or_b32 s17, s1, s17 -; GFX7-HSA-NEXT: s_lshr_b32 s1, s0, 8 -; GFX7-HSA-NEXT: s_and_b32 s16, s0, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s18, s19, s18 -; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s19, s0, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s19, s19, s1 +; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24 +; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24 ; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 -; GFX7-HSA-NEXT: s_or_b32 s16, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s0, s3, 8 -; GFX7-HSA-NEXT: s_and_b32 s15, s3, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s1, s3, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s20, s1, s0 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8 +; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00 +; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1 ; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24 +; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00 ; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s0, s2, 8 -; GFX7-HSA-NEXT: s_and_b32 s14, s2, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s1, s2, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s15, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8 +; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00 ; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s0, s5, 8 -; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s1, s5, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s14, s1, s0 ; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s13, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00 +; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24 ; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s0, s4, 8 -; GFX7-HSA-NEXT: s_and_b32 s12, s4, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s1, s4, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s13, s1, s0 +; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0 ; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 -; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s0, s7, 8 -; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00 -; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s1, s7, 0x80010 -; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0 -; GFX7-HSA-NEXT: s_and_b32 s1, s7, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s7, s11, 8 +; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00 -; GFX7-HSA-NEXT: s_or_b32 s1, s1, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 8 -; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff0000 -; GFX7-HSA-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-HSA-NEXT: s_and_b32 s6, s6, 0xff -; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-HSA-NEXT: s_or_b32 s7, s11, s7 -; GFX7-HSA-NEXT: s_or_b32 s6, s6, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16 +; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24 +; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0 +; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff +; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8 +; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5 ; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -116,21 +116,29 @@ ; GFX10-LABEL: shuffle1004: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v6, v[0:1], off -; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[2:3], off +; GFX10-NEXT: global_load_dword v7, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x40405 +; GFX10-NEXT: v_perm_b32 v1, 0, v7, 0xc0c0001 +; GFX10-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10-NEXT: global_store_dword v[4:5], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: shuffle1004: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v6, v[0:1], off -; GFX9-NEXT: global_load_dword v7, v[2:3], off -; GFX9-NEXT: s_mov_b32 s4, 0x40405 +; GFX9-NEXT: global_load_dword v6, v[2:3], off +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, 0, v7, s4 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -205,10 +213,15 @@ ; GFX10-LABEL: shuffle0554: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: ds_read_b32 v0, v0 ; GFX10-NEXT: ds_read_b32 v1, v1 +; GFX10-NEXT: ds_read_b32 v0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v1 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0xc0c0001 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x10104 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: ds_write_b32 v2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -216,11 +229,16 @@ ; GFX9-LABEL: shuffle0554: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: ds_read_b32 v0, v0 ; GFX9-NEXT: ds_read_b32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x10104 +; GFX9-NEXT: ds_read_b32 v0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff00, v1 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: ds_write_b32 v2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -432,7 +450,9 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_perm_b32 v0, v0, v0, 0x7060607 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0xc0c0001 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v0, v1 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -440,9 +460,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x7060607 +; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v0, v0, s4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff0000 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 ; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -789,16 +812,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 -; GFX10-NEXT: v_add_nc_u16 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v[5:6], v1, off -; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_store: @@ -806,19 +830,19 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_add_u16_e32 v3, v0, v9 -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_or_b32_e32 v1, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v9 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: global_store_dword v[5:6], v1, off -; GFX9-NEXT: global_store_dword v[7:8], v0, off +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: global_store_dword v[5:6], v0, off +; GFX9-NEXT: global_store_dword v[7:8], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %vec0 = load <4 x i8>, ptr addrspace(1) %in0, align 4 @@ -848,16 +872,17 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, 8, v4 -; GFX10-NEXT: v_add_nc_u16 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0xffffff00 ; GFX10-NEXT: v_add_nc_u16 v3, v2, v9 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 -; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v[5:6], v1, off -; GFX10-NEXT: global_store_dword v[7:8], v0, off +; GFX10-NEXT: global_store_dword v[5:6], v0, off +; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: add_store_div_16: @@ -871,17 +896,17 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v9 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v1, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_sdwa v2, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_add_u16_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: global_store_dword v[5:6], v0, off ; GFX9-NEXT: global_store_dword v[7:8], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -911,23 +936,28 @@ ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff00 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 24, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v9 -; GFX10-NEXT: v_lshrrev_b16 v3, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX10-NEXT: v_lshrrev_b16 v10, 8, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX10-NEXT: v_and_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-NEXT: v_add_nc_u16 v2, v9, v2 -; GFX10-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-NEXT: v_add_nc_u16 v1, v1, v10 +; GFX10-NEXT: v_add_nc_u16 v4, v10, v9 +; GFX10-NEXT: v_add_nc_u16 v1, v1, v11 +; GFX10-NEXT: v_perm_b32 v9, 0, v9, 0xc0c0001 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 ; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX10-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v9 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x10705 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -939,24 +969,30 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v4, v[2:3], off ; GFX9-NEXT: global_load_dword v9, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x10705 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_movk_i32 s4, 0xff00 +; GFX9-NEXT: s_mov_b32 s5, 0xc0c0001 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v1, v9, v4, s4 -; GFX9-NEXT: v_add_u16_sdwa v2, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v3, v4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_add_u16_sdwa v9, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v9 +; GFX9-NEXT: v_and_b32_sdwa v2, v9, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b16_e32 v3, 8, v0 +; GFX9-NEXT: v_perm_b32 v4, 0, v0, s5 +; GFX9-NEXT: v_add_u16_sdwa v10, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v1, v1, v0 +; GFX9-NEXT: v_add_u16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -977,25 +1013,24 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: global_load_dword v9, v[2:3], off +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v0, 2 ; GFX10-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x102 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v2, 0x100, v4 +; GFX10-NEXT: v_and_b32_sdwa v2, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x5070006 +; GFX10-NEXT: v_and_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v3, 0x100, v9 +; GFX10-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x5070006 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1011,22 +1046,21 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x5070006 ; GFX9-NEXT: v_mov_b32_e32 v0, 2 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-NEXT: s_movk_i32 s5, 0x102 -; GFX9-NEXT: s_mov_b32 s4, 0x5070006 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v2, 0x100, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_and_b32_sdwa v1, v2, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 +; GFX9-NEXT: v_and_b32_sdwa v3, v9, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v9, 0x100, v4 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v3, off +; GFX9-NEXT: global_store_dword v[7:8], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1052,21 +1086,21 @@ ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, 26 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 24, v9 -; GFX10-NEXT: v_bfe_i32 v2, v9, 0, 8 -; GFX10-NEXT: v_lshlrev_b16 v3, 6, v1 -; GFX10-NEXT: v_lshlrev_b16 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_bfe_i32 v1, v9, 0, 8 +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 24, v9 +; GFX10-NEXT: v_ashrrev_i32_sdwa v2, v2, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v1, 7, v1 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ashrrev_i16 v4, 10, v0 ; GFX10-NEXT: v_perm_b32 v0, v9, v0, 0x4010707 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v3 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v2 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff00, v1 +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off ; GFX10-NEXT: global_store_dword v[7:8], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1082,23 +1116,23 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v0, 26 +; GFX9-NEXT: v_mov_b32_e32 v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 7 ; GFX9-NEXT: s_mov_b32 s4, 0x4010707 -; GFX9-NEXT: v_mov_b32_e32 v0, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 24, v4 +; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v2, v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v2, v4, v9, s4 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v0, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 6, v1 -; GFX9-NEXT: v_ashrrev_i16_e32 v3, 10, v9 -; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffffff00, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff00, v4 -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v9, s4 +; GFX9-NEXT: v_ashrrev_i16_e32 v9, 10, v9 +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v2, off +; GFX9-NEXT: global_store_dword v[7:8], v3, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2069,20 +2103,24 @@ ; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 24, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_sub_nc_u16 v2, v1, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_sub_nc_u16 v3, v0, v3 -; GFX10-NEXT: v_sub_nc_u16 v9, v1, v4 -; GFX10-NEXT: v_sub_nc_u16 v10, v4, v2 -; GFX10-NEXT: v_sub_nc_u16 v1, v4, v1 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x6070007 +; GFX10-NEXT: v_sub_nc_u16 v9, v4, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_sub_nc_u16 v10, v1, v4 +; GFX10-NEXT: v_perm_b32 v4, 0, v4, 0xc0c0001 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_lshlrev_b16 v4, 8, v9 -; GFX10-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off ; GFX10-NEXT: global_store_dword v[7:8], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2092,27 +2130,31 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0x6070007 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xc0c0001 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; GFX9-NEXT: v_sub_u16_sdwa v9, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v4, v2, v0, s4 -; GFX9-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_sub_u16_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v4, off +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_sub_u16_sdwa v3, v4, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_sub_u16_e32 v9, v1, v9 +; GFX9-NEXT: v_sub_u16_sdwa v10, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2 +; GFX9-NEXT: v_perm_b32 v4, 0, v2, s4 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -1598,12 +1598,11 @@ ; GFX67-LABEL: v_mul_add_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x10000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 @@ -1642,11 +1641,10 @@ ; GFX67-LABEL: v_mul_add_1_v2i16_commute: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x10000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 1, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0 @@ -1731,12 +1729,11 @@ ; GFX67-LABEL: v_mul_sub_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xffff0000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 @@ -1775,11 +1772,10 @@ ; GFX67-LABEL: v_mul_sub_1_v2i16_commute: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, -1, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xffff0000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, -1, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v2, v0 @@ -1866,12 +1862,11 @@ ; GFX67-LABEL: v_mul_add_2_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, 2, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0x20000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, 2, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 @@ -1910,12 +1905,11 @@ ; GFX67-LABEL: v_mul_sub_2_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX67-NEXT: v_add_i32_e32 v2, vcc, -2, v2 -; GFX67-NEXT: v_add_i32_e32 v3, vcc, 0xfffe0000, v3 -; GFX67-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_add_i32_e32 v3, vcc, -2, v3 ; GFX67-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX67-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX67-NEXT: v_mul_u32_u24_e32 v1, v1, v3 @@ -2677,15 +2671,14 @@ ; GFX67-LABEL: v_mul_9_add_52_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, 9, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, 9, 52 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, 9, 52 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x340000, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_9_add_52_v2i16: @@ -2912,15 +2905,14 @@ ; GFX67-LABEL: v_mul_5_add_1_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_mul_u32_u24_e32 v1, 5, v1 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, 5, 1 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, 5, 1 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x10000, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_5_add_1_v2i16: @@ -2955,17 +2947,16 @@ ; GFX67-LABEL: v_mul_284_add_82_v2i16: ; GFX67: ; %bb.0: ; GFX67-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-NEXT: s_movk_i32 s4, 0x11c -; GFX67-NEXT: v_mul_u32_u24_e32 v1, 0x11c, v1 ; GFX67-NEXT: v_mov_b32_e32 v2, 0x52 +; GFX67-NEXT: v_mad_u32_u24 v1, v1, s4, v2 ; GFX67-NEXT: v_mad_u32_u24 v0, v0, s4, v2 -; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX67-NEXT: v_and_b32_e32 v0, 0xfffe, v0 -; GFX67-NEXT: v_add_i32_e32 v1, vcc, 0x520000, v1 -; GFX67-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX67-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX67-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX67-NEXT: v_and_b32_e32 v1, 0xfffe, v1 ; GFX67-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_284_add_82_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -105,10 +105,12 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_or_b32_e32 v2, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v0, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -119,10 +121,12 @@ ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0 +; VI-NEXT: v_or_b32_e32 v2, v1, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_or_b32_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: @@ -141,10 +145,12 @@ ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; SI-NEXT: v_or_b32_e32 v2, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff00, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; SI-NEXT: v_or_b32_e32 v1, v0, v3 +; SI-NEXT: v_or_b32_e32 v0, v2, v3 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -207,17 +207,16 @@ ; GFX6-LABEL: v_usubsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_u32_e32 v1, v1, v4 +; GFX6-NEXT: v_max_u32_e32 v1, v1, v3 ; GFX6-NEXT: v_max_u32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_usubsat_v2i16: