diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1434,11 +1434,6 @@ unsigned NumSrcElt = SrcVT.getVectorNumElements(); assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types"); - // We have some TableGen patterns for when the extracted vector is exactly - // the low or high half of the operand. - if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt)) - return Op; - // Extract 32-bit registers at a time. EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2); EVT NewVT = NumElt == 2 diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1370,66 +1370,6 @@ } -def : Pat < - (extract_subvector v4i16:$vec, (i32 0)), - (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) ->; - -def : Pat < - (extract_subvector v4i16:$vec, (i32 2)), - (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) ->; - -def : Pat < - (extract_subvector v4f16:$vec, (i32 0)), - (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) ->; - -def : Pat < - (extract_subvector v4f16:$vec, (i32 2)), - (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) ->; - -def : Pat < - (extract_subvector v8i16:$vec, (i32 0)), - (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1)) ->; - -def : Pat < - (extract_subvector v8i16:$vec, (i32 4)), - (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3)) ->; - -def : Pat < - (extract_subvector v8f16:$vec, (i32 0)), - (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1)) ->; - -def : Pat < - (extract_subvector v8f16:$vec, (i32 4)), - (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16i16:$vec, (i32 0)), - (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16i16:$vec, (i32 8)), - (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7)) ->; - -def : Pat < - (extract_subvector v16f16:$vec, (i32 0)), - (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16f16:$vec, (i32 8)), - (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7)) ->; - foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast(sub#Index) diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -366,22 +366,20 @@ ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: .LBB2_4: ; %exit -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v3, v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v4, 0x3d00 -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v4, vcc -; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v6, v4, v3, vcc ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5 +; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F @@ -882,22 +880,20 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: .LBB5_4: ; %exit -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_perm_b32 v0, v5, v5, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3800 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00 -; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc -; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v2, vcc ; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v1 src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-NEXT: v_pack_b32_f16 v1, v5, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc +; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4 +; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] br i1 undef, label %T, label %F diff --git a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll --- a/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll +++ b/llvm/test/CodeGen/AMDGPU/greedy-reverse-local-assignment.ll @@ -30,12 +30,12 @@ ; REVERSEXNACK-LABEL: shuffle_v4f16_234u: ; REVERSEXNACK: ; %bb.0: ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v1 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v0 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v3 -; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v2 -; REVERSEXNACK-NEXT: global_load_dword v0, v[5:6], off offset:4 -; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[3:4], off +; REVERSEXNACK-NEXT: v_mov_b32_e32 v6, v3 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v4, v1 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v3, v0 +; REVERSEXNACK-NEXT: v_mov_b32_e32 v5, v2 +; REVERSEXNACK-NEXT: global_load_dword v0, v[3:4], off offset:4 +; REVERSEXNACK-NEXT: global_load_dwordx2 v[1:2], v[5:6], off ; REVERSEXNACK-NEXT: s_waitcnt vmcnt(0) ; REVERSEXNACK-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -187,13 +187,13 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: v_readfirstlane_b32 s1, v1 -; VI-NEXT: s_ashr_i32 s2, s0, 16 -; VI-NEXT: s_sext_i32_i16 s0, s0 -; VI-NEXT: s_ashr_i32 s3, s1, 16 +; VI-NEXT: v_readfirstlane_b32 s0, v1 +; VI-NEXT: v_readfirstlane_b32 s1, v0 +; VI-NEXT: s_ashr_i32 s2, s1, 16 ; VI-NEXT: s_sext_i32_i16 s1, s1 -; VI-NEXT: s_ashr_i32 s0, s0, s1 +; VI-NEXT: s_ashr_i32 s3, s0, 16 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_ashr_i32 s0, s1, s0 ; VI-NEXT: s_ashr_i32 s1, s2, s3 ; VI-NEXT: s_lshl_b32 s1, s1, 16 ; VI-NEXT: s_and_b32 s0, s0, 0xffff