Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -183,6 +183,13 @@
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
 
+  /// Reduce a shift by a constant to an unmerge and a shift on a half sized
+  /// type. This will not produce a shift smaller than \p TargetShiftSize.
+  bool matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize,
+                                  unsigned &ShiftVal);
+  bool applyCombineShiftToUnmerge(MachineInstr &MI, const unsigned &ShiftVal);
+  bool tryCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftAmount);
+
   /// Try to transform \p MI by using all of the above
   /// combine functions. Returns true if changed.
   bool tryCombine(MachineInstr &MI);
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1402,6 +1402,70 @@
   return true;
 }
 
+bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
+                                                unsigned TargetShiftSize,
+                                                unsigned &ShiftVal) {
+  assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a shift");
+
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+  if (Ty.isVector()) // TODO:
+    return false;
+
+  // Don't narrow further than the requested size.
+  unsigned Size = Ty.getSizeInBits();
+  if (Size <= TargetShiftSize)
+    return false;
+
+  auto MaybeImmVal =
+    getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+  if (!MaybeImmVal)
+    return false;
+
+  ShiftVal = MaybeImmVal->Value;
+  return ShiftVal >= Size / 2 && ShiftVal < Size;
+}
+
+// dst = G_LSHR s64:x, C for C >= 32
+// =>
+//   lo, hi = G_UNMERGE_VALUES x
+//   dst = merge_values (G_LSHR hi, C - 32), 0
+bool CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI,
+                                                const unsigned &ShiftVal) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT Ty = MRI.getType(SrcReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  assert(ShiftVal >= Size / 2);
+  LLT HalfTy = LLT::scalar(Size / 2);
+
+  Builder.setInstr(MI);
+  auto Unmerge = Builder.buildUnmerge(HalfTy, SrcReg);
+
+  Register Narrowed = Unmerge.getReg(1);
+  if (ShiftVal > Size / 2) {
+    Narrowed = Builder.buildLShr(
+      HalfTy, Unmerge.getReg(1),
+      Builder.buildConstant(HalfTy, ShiftVal - Size / 2)).getReg(0);
+  }
+
+  Builder.buildMerge(
+    DstReg, { Narrowed, Builder.buildConstant(HalfTy, 0).getReg(0) });
+  MI.eraseFromParent();
+  return true;
+}
+
+bool CombinerHelper::tryCombineShiftToUnmerge(MachineInstr &MI,
+                                              unsigned TargetShiftAmount) {
+  unsigned ShiftAmt;
+  if (matchCombineShiftToUnmerge(MI, TargetShiftAmount, ShiftAmt)) {
+    applyCombineShiftToUnmerge(MI, ShiftAmt);
+    return true;
+  }
+
+  return false;
+}
+
 bool CombinerHelper::tryCombine(MachineInstr &MI) {
   if (tryCombineCopy(MI))
     return true;
Index: llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -165,6 +165,11 @@
     return true;
 
   switch (MI.getOpcode()) {
+  case TargetOpcode::G_LSHR:
+    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+    // common case, splitting this into a move and a 32-bit shift is faster and
+    // the same code size.
+    return Helper.tryCombineShiftToUnmerge(MI, 32);
   case TargetOpcode::G_CONCAT_VECTORS:
     return Helper.tryCombineConcatVectors(MI);
   case TargetOpcode::G_SHUFFLE_VECTOR:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
@@ -0,0 +1,202 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name: narrow_lshr_s64_32_s64amt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_32_s64amt
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 32
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_32
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UV1]](s32), [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 32
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_33
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_33
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LSHR]](s32), [[C1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 33
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_31
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_31
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 31
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_63
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_63
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LSHR]](s32), [[C1]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 63
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_64
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 64
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s64_65
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_s64_65
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 65
+    %2:_(s64) = G_LSHR %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_lshr_s32_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: narrow_lshr_s32_16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s32) = G_LSHR %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: narrow_lshr_s32_17
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: narrow_lshr_s32_17
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 17
+    %2:_(s32) = G_LSHR %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: narrow_lshr_v2s32_17
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_lshr_v2s32_17
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
+    ; CHECK: [[LSHR:%[0-9]+]]:_(<2 x s32>) = G_LSHR [[COPY]], [[BUILD_VECTOR]](<2 x s32>)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[LSHR]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 17
+    %2:_(<2 x s32>) = G_BUILD_VECTOR %1, %1
+    %3:_(<2 x s32>) = G_LSHR %0, %2
+    $vgpr0_vgpr1 = COPY %3
+...
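
For illustration only, a minimal sketch (not part of the patch) of how another target's combiner hook could forward G_LSHR to the new entry point, the same way the AMDGPU pre-legalizer combiner does above. The function name combineShiftsForTarget and the surrounding context are hypothetical; only CombinerHelper::tryCombineShiftToUnmerge and CombinerHelper::tryCombine come from the patched interface, and the 32-bit threshold mirrors the AMDGPU choice.

#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"

using namespace llvm;

// Hypothetical dispatch called from a target's CombinerInfo::combine() hook;
// 'Helper' is the CombinerHelper that hook already constructs.
static bool combineShiftsForTarget(CombinerHelper &Helper, MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_LSHR:
    // Never produce a shift narrower than 32 bits. A 64-bit G_LSHR by a
    // constant >= 32 becomes an unmerge, an optional 32-bit shift of the
    // high half, and a merge with zero in the high result.
    return Helper.tryCombineShiftToUnmerge(MI, /*TargetShiftAmount=*/32);
  default:
    // Fall back to the generic combines.
    return Helper.tryCombine(MI);
  }
}

A smaller TargetShiftAmount would also permit narrowing 32-bit shifts, if producing 16-bit shifts is ever profitable for the target.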