Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -342,6 +342,9 @@
   /// success.
   bool matchAndWithTrivialMask(MachineInstr &MI, Register &Replacement);
 
+  /// \return true if \p MI is a G_SEXT_INREG that can be erased.
+  bool matchRedundantSExtInReg(MachineInstr &MI);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -344,6 +344,16 @@
   (apply [{ return Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
 >;
 
+// If the input is already sign extended, just drop the extension.
+// sext_inreg x, K ->
+//   if computeNumSignBits(x) >= (x.getScalarSizeInBits() - K + 1)
+def redundant_sext_inreg: GICombineRule <
+  (defs root:$root),
+  (match (wip_match_opcode G_SEXT_INREG):$root,
+         [{ return Helper.matchRedundantSExtInReg(*${root}); }]),
+  (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 1); }])
+>;
+
 // FIXME: These should use the custom predicate feature once it lands.
 def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
                                      undef_to_negative_one,
@@ -357,7 +367,8 @@
                                         binop_right_to_zero, p2i_to_i2p,
                                         i2p_to_p2i]>;
 
-def known_bits_simplifications : GICombineGroup<[and_trivial_mask]>;
+def known_bits_simplifications : GICombineGroup<[
+  and_trivial_mask, redundant_sext_inreg]>;
 
 def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
 
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2151,6 +2151,14 @@
   return KB->maskedValueIsZero(Replacement, ~Mask);
 }
 
+bool CombinerHelper::matchRedundantSExtInReg(MachineInstr &MI) {
+  // If the input is already sign extended, just drop the extension.
+  Register Src = MI.getOperand(1).getReg();
+  unsigned ExtBits = MI.getOperand(2).getImm();
+  unsigned TypeSize = MRI.getType(Src).getScalarSizeInBits();
+  return KB->computeNumSignBits(Src) >= (TypeSize - ExtBits + 1);
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-inreg.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-sext-inreg.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: sext_inreg_s32_7_sextload_from_1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_7_sextload_from_1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SEXTLOAD]], 7
+    ; GCN: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_SEXTLOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT_INREG %1, 7
+    $vgpr0 = COPY %2
+
+...
+
+---
+name: sext_inreg_s32_8_sextload_from_1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_8_sextload_from_1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: $vgpr0 = COPY [[SEXTLOAD]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_SEXTLOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT_INREG %1, 8
+    $vgpr0 = COPY %2
+
+...
+
+---
+name: sext_inreg_s32_9_sextload_from_1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_9_sextload_from_1
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: $vgpr0 = COPY [[SEXTLOAD]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_SEXTLOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT_INREG %1, 9
+    $vgpr0 = COPY %2
+
+...
+
+---
+name: sext_inreg_s32_7_sext_from_s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_7_sext_from_s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[SEXTLOAD]], 7
+    ; GCN: $vgpr0 = COPY [[SEXT_INREG]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s8) = G_LOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT %1
+    %3:_(s32) = G_SEXT_INREG %2, 7
+    $vgpr0 = COPY %3
+
+...
+
+---
+name: sext_inreg_s32_8_sext_from_s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_8_sext_from_s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: $vgpr0 = COPY [[SEXTLOAD]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s8) = G_LOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT %1
+    %3:_(s32) = G_SEXT_INREG %2, 8
+    $vgpr0 = COPY %3
+
+...
+
+---
+name: sext_inreg_s32_9_sext_from_s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_s32_9_sext_from_s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p1) :: (load 1, addrspace 1)
+    ; GCN: $vgpr0 = COPY [[SEXTLOAD]](s32)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(s8) = G_LOAD %0 :: (load 1, addrspace 1)
+    %2:_(s32) = G_SEXT %1
+    %3:_(s32) = G_SEXT_INREG %2, 9
+    $vgpr0 = COPY %3
+
+...
+
+---
+name: sext_inreg_v2s32_7_sext_from_v2s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_v2s32_7_sext_from_v2s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1)
+    ; GCN: [[SEXT:%[0-9]+]]:_(<2 x s32>) = G_SEXT [[LOAD]](<2 x s8>)
+    ; GCN: [[SEXT_INREG:%[0-9]+]]:_(<2 x s32>) = G_SEXT_INREG [[SEXT]], 7
+    ; GCN: $vgpr0_vgpr1 = COPY [[SEXT_INREG]](<2 x s32>)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s8>) = G_LOAD %0 :: (load 2, addrspace 1)
+    %2:_(<2 x s32>) = G_SEXT %1
+    %3:_(<2 x s32>) = G_SEXT_INREG %2, 7
+    $vgpr0_vgpr1 = COPY %3
+
+...
+
+---
+name: sext_inreg_v2s32_8_sext_from_v2s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_v2s32_8_sext_from_v2s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1)
+    ; GCN: [[SEXT:%[0-9]+]]:_(<2 x s32>) = G_SEXT [[LOAD]](<2 x s8>)
+    ; GCN: $vgpr0_vgpr1 = COPY [[SEXT]](<2 x s32>)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s8>) = G_LOAD %0 :: (load 2, addrspace 1)
+    %2:_(<2 x s32>) = G_SEXT %1
+    %3:_(<2 x s32>) = G_SEXT_INREG %2, 8
+    $vgpr0_vgpr1 = COPY %3
+
+...
+
+---
+name: sext_inreg_v2s32_9_sext_from_v2s8
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+    ; GCN-LABEL: name: sext_inreg_v2s32_9_sext_from_v2s8
+    ; GCN: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+    ; GCN: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; GCN: [[LOAD:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[COPY]](p1) :: (load 2, addrspace 1)
+    ; GCN: [[SEXT:%[0-9]+]]:_(<2 x s32>) = G_SEXT [[LOAD]](<2 x s8>)
+    ; GCN: $vgpr0_vgpr1 = COPY [[SEXT]](<2 x s32>)
+    %0:_(p1) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s8>) = G_LOAD %0 :: (load 2, addrspace 1)
+    %2:_(<2 x s32>) = G_SEXT %1
+    %3:_(<2 x s32>) = G_SEXT_INREG %2, 9
+    $vgpr0_vgpr1 = COPY %3
+
+...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -1021,8 +1021,7 @@
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
-; GFX8-NEXT:    s_bfe_i32 s6, 0, 0x180000
-; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], s6, v0
+; GFX8-NEXT:    v_cmp_gt_i32_e64 s[6:7], 0, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
@@ -1079,8 +1078,7 @@
 ; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
-; GFX8-NEXT:    s_bfe_i32 s4, 0, 0x180000
-; GFX8-NEXT:    s_cmp_lt_i32 s1, s4
+; GFX8-NEXT:    s_cmp_lt_i32 s1, 0
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -1021,8 +1021,7 @@
 ; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX8-NEXT:    v_bfe_i32 v0, v1, 0, 24
-; GFX8-NEXT:    s_bfe_i32 s6, 0, 0x180000
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[6:7], s6, v0
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[6:7], 0, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 23, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xff800000, v0
 ; GFX8-NEXT:    s_xor_b64 vcc, s[6:7], s[4:5]
@@ -1079,8 +1078,7 @@
 ; GFX8-NEXT:    s_cmp_lt_i32 s3, s0
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_bfe_i32 s1, s1, 0x180000
-; GFX8-NEXT:    s_bfe_i32 s4, 0, 0x180000
-; GFX8-NEXT:    s_cmp_gt_i32 s1, s4
+; GFX8-NEXT:    s_cmp_gt_i32 s1, 0
 ; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    s_xor_b32 s0, s1, s0
 ; GFX8-NEXT:    s_ashr_i32 s1, s3, 23
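
Note (not part of the patch): the fold boundary the MIR tests pin down falls directly out of the sign-bit inequality in matchRedundantSExtInReg. A sign extension from s8 to s32 (G_SEXTLOAD of 1 byte, or G_SEXT of s8) yields 32 - 8 + 1 = 25 known sign bits, so G_SEXT_INREG x, 7 (which needs 26) must be kept, while G_SEXT_INREG x, 8 and x, 9 (which need 25 and 24) fold away. Below is a minimal standalone C++ sketch of that arithmetic; the helper names are illustrative only and do not exist in LLVM.

#include <cassert>

// Known sign bits of a value sign-extended from SrcBits into DstBits,
// e.g. a G_SEXTLOAD of s8 into s32 gives 32 - 8 + 1 = 25.
static unsigned numSignBitsOfSExt(unsigned SrcBits, unsigned DstBits) {
  return DstBits - SrcBits + 1;
}

// The combine's predicate: G_SEXT_INREG %x, ExtBits on a TypeSize-wide
// value is redundant once %x already has TypeSize - ExtBits + 1 sign bits.
static bool isRedundantSExtInReg(unsigned NumSignBits, unsigned TypeSize,
                                 unsigned ExtBits) {
  return NumSignBits >= TypeSize - ExtBits + 1;
}

int main() {
  unsigned SignBits = numSignBitsOfSExt(8, 32); // 25, for a sextload of s8
  assert(!isRedundantSExtInReg(SignBits, 32, 7)); // needs 26 -> kept
  assert(isRedundantSExtInReg(SignBits, 32, 8));  // needs 25 -> folded
  assert(isRedundantSExtInReg(SignBits, 32, 9));  // needs 24 -> folded
  return 0;
}

This is the same pattern in both the scalar and vector test trios, and it is also why the saddsat/ssubsat diffs drop the s_bfe_i32 of the constant 0: a constant 0 has all 32 sign bits, so its sext_inreg is always redundant.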