diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -294,6 +294,12 @@
   bool applyBuildInstructionSteps(MachineInstr &MI,
                                   InstructionStepsMatchInfo &MatchInfo);
 
+  /// Match ashr (shl x, C), C -> sext_inreg (C)
+  bool matchAshrShlToSextInreg(MachineInstr &MI,
+                               std::tuple<Register, int64_t> &MatchInfo);
+  bool applyAshShlToSextInreg(MachineInstr &MI,
+                              std::tuple<Register, int64_t> &MatchInfo);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -251,6 +251,12 @@
   return BinaryOp_match<LHS, RHS, TargetOpcode::G_SHL, false>(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>
+m_GAShr(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_ASHR, false>(L, R);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -284,6 +284,15 @@
   (apply [{ return Helper.applyBuildInstructionSteps(*${root}, ${info});}])
 >;
 
+// Fold ashr (shl x, C), C -> sext_inreg (C)
+def shl_ashr_to_sext_inreg_matchinfo : GIDefMatchData<"std::tuple<Register, int64_t>">;
+def shl_ashr_to_sext_inreg : GICombineRule<
+  (defs root:$root, shl_ashr_to_sext_inreg_matchinfo:$info),
+  (match (wip_match_opcode G_ASHR): $root,
+         [{ return Helper.matchAshrShlToSextInreg(*${root}, ${info}); }]),
+  (apply [{ return Helper.applyAshShlToSextInreg(*${root}, ${info});}])
+>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -301,4 +310,5 @@
 def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
-    hoist_logic_op_with_same_opcode_hands]>;
+    hoist_logic_op_with_same_opcode_hands,
+    shl_ashr_to_sext_inreg]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1887,6 +1887,36 @@
   return true;
 }
 
+bool CombinerHelper::matchAshrShlToSextInreg(
+    MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+  int64_t ShlCst, AshrCst;
+  Register Src;
+  // FIXME: detect splat constant vectors.
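+  // The source must be a G_SHL whose shift amount equals the G_ASHR's shift
+  // amount C; in that case the shl/ashr pair is equivalent to sign-extending
+  // the low (bitwidth - C) bits of the G_SHL's input.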
+  if (!mi_match(MI.getOperand(0).getReg(), MRI,
+                m_GAShr(m_GShl(m_Reg(Src), m_ICst(ShlCst)), m_ICst(AshrCst))))
+    return false;
+  if (ShlCst != AshrCst)
+    return false;
+  if (!isLegalOrBeforeLegalizer(
+          {TargetOpcode::G_SEXT_INREG, {MRI.getType(Src)}}))
+    return false;
+  MatchInfo = {Src, ShlCst};
+  return true;
+}
+bool CombinerHelper::applyAshShlToSextInreg(
+    MachineInstr &MI, std::tuple<Register, int64_t> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+  Register Src;
+  int64_t ShiftAmt;
+  std::tie(Src, ShiftAmt) = MatchInfo;
+  unsigned Size = MRI.getType(Src).getScalarSizeInBits();
+  Builder.setInstrAndDebugLoc(MI);
+  Builder.buildSExtInReg(MI.getOperand(0).getReg(), Src, Size - ShiftAmt);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir
@@ -0,0 +1,90 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+---
+name:            ashr_shl_to_sext_inreg
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: ashr_shl_to_sext_inreg
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s16) = G_SEXT_INREG [[TRUNC]], 8
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXT_INREG]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: RET_ReallyLR implicit $w0
+    %1:_(s32) = COPY $w0
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s16) = G_CONSTANT i16 8
+    %3:_(s16) = G_SHL %0, %2(s16)
+    %4:_(s16) = exact G_ASHR %3, %2(s16)
+    %5:_(s32) = G_ANYEXT %4(s16)
+    $w0 = COPY %5(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            different_shift_amts
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+body:             |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: different_shift_amts
+    ; CHECK: liveins: $w0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 12
+    ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
+    ; CHECK: [[ASHR:%[0-9]+]]:_(s16) = exact G_ASHR [[SHL]], [[C1]](s16)
+    ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16)
+    ; CHECK: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK: RET_ReallyLR implicit $w0
+    %1:_(s32) = COPY $w0
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s16) = G_CONSTANT i16 12
+    %4:_(s16) = G_CONSTANT i16 8
+    %3:_(s16) = G_SHL %0, %2(s16)
+    %5:_(s16) = exact G_ASHR %3, %4(s16)
+    %6:_(s32) = G_ANYEXT %5(s16)
+    $w0 = COPY %6(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name:            ashr_shl_to_sext_inreg_vector
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+body:             |
+  bb.1:
+    liveins: $d0
+    ; Currently don't support this for vectors just yet, this will need updating
+    ; when we do.
+    ; CHECK-LABEL: name: ashr_shl_to_sext_inreg_vector
+    ; CHECK: liveins: $d0
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
+    ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+    ; CHECK: [[SHL:%[0-9]+]]:_(<4 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK: [[ASHR:%[0-9]+]]:_(<4 x s16>) = exact G_ASHR [[SHL]], [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK: $d0 = COPY [[ASHR]](<4 x s16>)
+    ; CHECK: RET_ReallyLR implicit $d0
+    %0:_(<4 x s16>) = COPY $d0
+    %2:_(s16) = G_CONSTANT i16 8
+    %1:_(<4 x s16>) = G_BUILD_VECTOR %2(s16), %2(s16), %2(s16), %2(s16)
+    %3:_(<4 x s16>) = G_SHL %0, %1(<4 x s16>)
+    %4:_(<4 x s16>) = exact G_ASHR %3, %1(<4 x s16>)
+    $d0 = COPY %4(<4 x s16>)
+    RET_ReallyLR implicit $d0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -674,8 +674,7 @@
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 8
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -830,8 +829,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_add_i32 s2, s2, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s2, 0x80000
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -854,8 +852,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_add_i32 s2, s2, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s2, 8
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 24
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 24
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -879,8 +876,7 @@
 ; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
   %load = load i8, i8 addrspace(1)* %ptr, align 1
@@ -904,8 +900,7 @@
 ; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 8, 0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 24, v0
+; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
   %load = load i8, i8 addrspace(1)* %ptr, align 1
@@ -927,8 +922,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -951,8 +945,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 30
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10001
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -975,8 +968,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 30
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20001
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -423,8 +423,7 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
-; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x10000
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
@@ -950,22 +949,22 @@
 define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_and_b32 s0, s0, 63
-; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x20002
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    v_mov_b32_e32 v0, s0
-; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
-; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
-; GFX6-NEXT:    s_endpgm
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_and_b32 s0, s0, 63
+; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x20002
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v0, s0
+; GFX6-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX6-NEXT:    s_endpgm
                                                           i32 addrspace(1)* %out1,
                                                           i32 addrspace(1)* %in) #0 {
   %src = load i32, i32 addrspace(1)* %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3415,8 +3415,7 @@
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3736,10 +3735,8 @@
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v3|
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3363,8 +3363,7 @@
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3677,20 +3676,18 @@
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
 ; CGP-NEXT:    v_or_b32_e32 v6, 1, v6
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v5, v5
 ; CGP-NEXT:    v_mad_f32 v1, -v5, v4, v1
 ; CGP-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v4|
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; CGP-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num,