diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -97,19 +97,25 @@ (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>; def foldable_fneg_matchdata : GIDefMatchData<"MachineInstr *">; - def foldable_fneg : GICombineRule< (defs root:$ffn, foldable_fneg_matchdata:$matchinfo), (match (wip_match_opcode G_FNEG):$ffn, [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]), (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; +def trunc_right_shift_reduction_matchdata : GIDefMatchData<"MachineInstr *">; +def trunc_right_shift : GICombineRule< + (defs root:$trunc, trunc_right_shift_reduction_matchdata:$matchinfo), + (match (wip_match_opcode G_TRUNC):$trunc, + [{ return Helper.matchTruncRightShiftReduction(*${trunc}, ${matchinfo}); }]), + (apply [{ Helper.applyTruncRightShiftReduction(*${trunc}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPreLegalizerCombinerHelper", - [all_combines, clamp_i64_to_i16, foldable_fneg]> { + [all_combines, clamp_i64_to_i16, foldable_fneg, trunc_right_shift]> { let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; let StateClass = "AMDGPUPreLegalizerCombinerHelperState"; let AdditionalArguments = []; @@ -119,7 +125,7 @@ "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq]> { + rcp_sqrt_to_rsq, trunc_right_shift]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h @@ -23,4 +23,9 @@ bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); + + bool matchTruncRightShiftReduction(MachineInstr &MI, + MachineInstr *&MatchInfo); + void applyTruncRightShiftReduction(MachineInstr &MI, + MachineInstr *&MatchInfo); }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -9,6 +9,7 @@ #include "AMDGPUCombinerHelper.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" @@ -380,3 +381,58 @@ MI.eraseFromParent(); } + +bool AMDGPUCombinerHelper::matchTruncRightShiftReduction( + MachineInstr &MI, MachineInstr *&MatchInfo) { + assert(MI.getOpcode() == AMDGPU::G_TRUNC); + + // Shrink >32 bits right shifts to 32-bit if truncated to <32 bits: + // e.g.: + // (i16 (trunc (i64 (sr[la] X, K)))) + // -> (i16 (trunc (i32 (sr[la] (i32 (trunc X)), K)))) + // + // Note: this only needs to handle right shifts as the generic trunc_shl + // combine handles left shifts. 
+  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  const unsigned DstSize = DstTy.getSizeInBits();
+  if (DstSize >= 32)
+    return false;
+
+  Register Src = MI.getOperand(1).getReg();
+  MachineInstr *SrcMI = getDefIgnoringCopies(Src, MRI);
+
+  if (MRI.getType(Src).getSizeInBits() <= 32 || !MRI.hasOneNonDBGUse(Src) ||
+      (SrcMI->getOpcode() != AMDGPU::G_ASHR &&
+       SrcMI->getOpcode() != AMDGPU::G_LSHR))
+    return false;
+
+  Register ShiftAmt = SrcMI->getOperand(2).getReg();
+
+  // Don't do the transformation if we risk losing information, i.e. if the
+  // shift may bring bits from above bit 31 into the truncated result.
+  if (KB->getKnownBits(ShiftAmt).getMaxValue().ugt(32 - DstSize))
+    return false;
+
+  MatchInfo = SrcMI;
+  return true;
+}
+
+void AMDGPUCombinerHelper::applyTruncRightShiftReduction(
+    MachineInstr &MI, MachineInstr *&MatchInfo) {
+  Builder.setInstrAndDebugLoc(MI);
+
+  Register ShiftSrc = MatchInfo->getOperand(1).getReg();
+  Register ShiftAmt = MatchInfo->getOperand(2).getReg();
+
+  // Truncate the shift source to 32 bits.
+  Register NewShiftSrc = MRI.createGenericVirtualRegister(LLT::scalar(32));
+  Builder.buildTrunc(NewShiftSrc, ShiftSrc);
+
+  // Create a 32-bit shift.
+  Register NewShiftDst = MRI.createGenericVirtualRegister(LLT::scalar(32));
+  Builder.buildInstr(MatchInfo->getOpcode(), {NewShiftDst},
+                     {NewShiftSrc, ShiftAmt});
+
+  // Fix the trunc operand.
+  replaceRegOpWith(MRI, MI.getOperand(1), NewShiftDst);
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-right-shift.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-right-shift.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-right-shift.mir
@@ -0,0 +1,139 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: s16_trunc_s64_lshr_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: s16_trunc_s64_lshr_16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], %amt(s32)
+    ; CHECK-NEXT: %trunc:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; CHECK-NEXT: %foo:_(s16) = G_CONSTANT i16 55
+    ; CHECK-NEXT: %keep:_(s32) = G_MERGE_VALUES %trunc(s16), %foo(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %keep(s32)
+    %0:_(s32) = COPY $vgpr0
+    %src:_(s64) = G_ZEXT %0
+    %amt:_(s32) = G_CONSTANT i32 16
+    %shift:_(s64) = G_LSHR %src, %amt
+    %trunc:_(s16) = G_TRUNC %shift
+    %foo:_(s16) = G_CONSTANT i16 55
+    %keep:_(s32) = G_MERGE_VALUES %trunc, %foo
+    $vgpr0 = COPY %keep
+...
+ +--- +name: s16_trunc_s64_ashr_16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: s16_trunc_s64_ashr_16 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], %amt(s32) + ; CHECK-NEXT: %trunc:_(s16) = G_TRUNC [[ASHR]](s32) + ; CHECK-NEXT: %foo:_(s16) = G_CONSTANT i16 55 + ; CHECK-NEXT: %keep:_(s32) = G_MERGE_VALUES %trunc(s16), %foo(s16) + ; CHECK-NEXT: $vgpr0 = COPY %keep(s32) + %0:_(s32) = COPY $vgpr0 + %src:_(s64) = G_ZEXT %0 + %amt:_(s32) = G_CONSTANT i32 16 + %shift:_(s64) = G_ASHR %src, %amt + %trunc:_(s16) = G_TRUNC %shift + %foo:_(s16) = G_CONSTANT i16 55 + %keep:_(s32) = G_MERGE_VALUES %trunc, %foo + $vgpr0 = COPY %keep +... + +--- +name: s16_trunc_s64_lshr_17_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: s16_trunc_s64_lshr_17_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %src:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: %shift:_(s64) = G_LSHR %src, %amt(s32) + ; CHECK-NEXT: %trunc:_(s16) = G_TRUNC %shift(s64) + ; CHECK-NEXT: %foo:_(s16) = G_CONSTANT i16 55 + ; CHECK-NEXT: %keep:_(s32) = G_MERGE_VALUES %trunc(s16), %foo(s16) + ; CHECK-NEXT: $vgpr0 = COPY %keep(s32) + %0:_(s32) = COPY $vgpr0 + %src:_(s64) = G_ZEXT %0 + %amt:_(s32) = G_CONSTANT i32 17 + %shift:_(s64) = G_LSHR %src, %amt + %trunc:_(s16) = G_TRUNC %shift + %foo:_(s16) = G_CONSTANT i16 55 + %keep:_(s32) = G_MERGE_VALUES %trunc, %foo + $vgpr0 = COPY %keep +... + +--- +name: s26_trunc_s64_lshr_6 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: s26_trunc_s64_lshr_6 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], %amt(s32) + ; CHECK-NEXT: %trunc:_(s26) = G_TRUNC [[LSHR]](s32) + ; CHECK-NEXT: %foo:_(s26) = G_CONSTANT i26 55 + ; CHECK-NEXT: %keep0:_(s26) = G_ADD %trunc, %foo + ; CHECK-NEXT: %keep1:_(s32) = G_ANYEXT %keep0(s26) + ; CHECK-NEXT: $vgpr0 = COPY %keep1(s32) + %0:_(s32) = COPY $vgpr0 + %src:_(s64) = G_ZEXT %0 + %amt:_(s32) = G_CONSTANT i32 6 + %shift:_(s64) = G_LSHR %src, %amt + %trunc:_(s26) = G_TRUNC %shift + %foo:_(s26) = G_CONSTANT i26 55 + %keep0:_(s26) = G_ADD %trunc, %foo + %keep1:_(s32) = G_ANYEXT %keep0 + $vgpr0 = COPY %keep1 +... + +--- +name: s26_trunc_s64_lshr_7_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: s26_trunc_s64_lshr_7_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: %src:_(s64) = G_ZEXT [[COPY]](s32) + ; CHECK-NEXT: %amt:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: %shift:_(s64) = G_LSHR %src, %amt(s32) + ; CHECK-NEXT: %trunc:_(s26) = G_TRUNC %shift(s64) + ; CHECK-NEXT: %foo:_(s26) = G_CONSTANT i26 55 + ; CHECK-NEXT: %keep0:_(s26) = G_ADD %trunc, %foo + ; CHECK-NEXT: %keep1:_(s32) = G_ANYEXT %keep0(s26) + ; CHECK-NEXT: $vgpr0 = COPY %keep1(s32) + %0:_(s32) = COPY $vgpr0 + %src:_(s64) = G_ZEXT %0 + %amt:_(s32) = G_CONSTANT i32 7 + %shift:_(s64) = G_LSHR %src, %amt + %trunc:_(s26) = G_TRUNC %shift + %foo:_(s26) = G_CONSTANT i26 55 + %keep0:_(s26) = G_ADD %trunc, %foo + %keep1:_(s32) = G_ANYEXT %keep0 + $vgpr0 = COPY %keep1 +...
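
For reference, a minimal LLVM IR sketch of the kind of input this combine targets; the function and value names and the shift amount of 16 are illustrative only and not taken from the patch. When compiled for amdgcn through GlobalISel, the 64-bit logical shift right feeding the i16 truncate should now be narrowed to a 32-bit shift of the truncated source before instruction selection, analogous to the s16_trunc_s64_lshr_16 MIR test above.

define i16 @trunc_of_wide_lshr(i64 %x) {
  ; Shift amount 16 satisfies amt <= 32 - 16, so the fold applies.
  %shift = lshr i64 %x, 16
  %trunc = trunc i64 %shift to i16
  ret i16 %trunc
}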