diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -88,6 +88,12 @@
          [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
   (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
 
+def shift_shufflevector_fold : GICombineRule<
+  (defs root:$shift, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_LSHR):$shift,
+         [{ return RegBankHelper.matchShiftShuffleVectorFold(*${shift}, ${matchinfo}); }]),
+  (apply [{ RegBankHelper.applyShiftShuffleVectorFold(*${shift}, ${matchinfo}); }])>;
+
 def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
 
 def remove_fcanonicalize : GICombineRule<
@@ -128,7 +134,8 @@
 def AMDGPURegBankCombinerHelper : GICombinerHelper<
   "AMDGPUGenRegBankCombinerHelper",
   [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
-   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
+   fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+   shift_shufflevector_fold]> {
   let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
   let StateClass = "AMDGPURegBankCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,6 +74,9 @@
   void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
   void applyClamp(MachineInstr &MI, Register &Reg);
 
+  bool matchShiftShuffleVectorFold(MachineInstr &MI, Register &Reg);
+  void applyShiftShuffleVectorFold(MachineInstr &MI, Register &Reg);
+
 private:
   AMDGPU::SIModeRegisterDefaults getMode();
   bool getIEEE();
@@ -328,6 +331,82 @@
   MI.eraseFromParent();
 }
 
+bool AMDGPURegBankCombinerHelper::matchShiftShuffleVectorFold(MachineInstr &MI,
+                                                              Register &Reg) {
+  // Fold
+  //   (G_LSHR (G_BITCAST (G_SHUFFLE_VECTOR %a, %b, shufflemask(?, 1))), K)
+  // into
+  //   (G_LSHR (G_BITCAST %a), K)
+  // if the shift amount (K) is 1/2 of the destination type, the vector types
+  // have 2 elements and %a has the type of the shuffle result.
+  //
+  // In the (little-endian) G_BITCAST, lane 1 of the shuffle is the high half
+  // of the scalar. G_LSHR by K moves that half down and fills the top with
+  // zeroes, so only lane 1 reaches the result; lane 0 is shifted out. When
+  // lane 1 is lane 1 of %a, the shuffle can therefore be bypassed. The shift
+  // has to stay: a bare G_BITCAST of %a would carry lane 1 of %a in the high
+  // half where the original value has zeroes.
+  //
+  // TODO: This could be made more generic (e.g. to support more vector sizes,
+  // lane 1 taken from %b or from lane 0, or even G_SHL) if need be.
+
+  assert(MI.getOpcode() == AMDGPU::G_LSHR);
+
+  // Only a scalar destination can be split into two halves by the shift.
+  const LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+  if (!DstTy.isScalar())
+    return false;
+
+  // The shift amount is 1/2 of the scalar type.
+  const auto ShiftAmount =
+      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+  if (!ShiftAmount || ShiftAmount->Value != (DstTy.getSizeInBits() / 2))
+    return false;
+
+  // The shift operand is a bitcast.
+  const MachineInstr *Cast =
+      getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+  if (!Cast || Cast->getOpcode() != AMDGPU::G_BITCAST)
+    return false;
+
+  // The bitcast src is a SHUFFLE_VECTOR.
+  const MachineInstr *Shuffle =
+      getDefIgnoringCopies(Cast->getOperand(1).getReg(), MRI);
+  if (!Shuffle || Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
+    return false;
+
+  // The SHUFFLE_VECTOR's dst type is a 2-element vector.
+  const LLT VectorTy = MRI.getType(Shuffle->getOperand(0).getReg());
+  if (!VectorTy.isVector() || VectorTy.getNumElements() != 2)
+    return false;
+
+  // The sources of a shuffle may be wider or narrower than its result; %a can
+  // only stand in for the shuffle if it has exactly the result type.
+  const Register Src = Shuffle->getOperand(1).getReg();
+  if (MRI.getType(Src) != VectorTy)
+    return false;
+
+  // Lane 1 of the shuffle is lane 1 of %a (an undef lane has index -1).
+  const ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
+  if (Mask.size() != 2 || Mask[1] != 1)
+    return false;
+
+  Reg = Src;
+  return true;
+}
+
+void AMDGPURegBankCombinerHelper::applyShiftShuffleVectorFold(MachineInstr &MI,
+                                                              Register &Reg) {
+  // Rebuild the shift on top of a bitcast of Reg (the shuffle source picked by
+  // the matcher). The new bitcast result is a clone of the old shift source so
+  // that it keeps the same type and register class/bank.
+  B.setInstrAndDebugLoc(MI);
+  const Register Cast = MRI.cloneVirtualRegister(MI.getOperand(1).getReg());
+  B.buildBitcast(Cast, Reg);
+  B.buildLShr(MI.getOperand(0).getReg(), Cast, MI.getOperand(2).getReg());
+  MI.eraseFromParent();
+}
+
 AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() {
   return MF.getInfo<SIMachineFunctionInfo>()->getMode();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-shufflevector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-shufflevector.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-shufflevector.mir
@@ -0,0 +1,195 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# NOTE(review): CHECK lines were updated by hand for the corrected fold; rerun update_mir_test_checks.py to confirm.
+
+---
+name: lshr16_v2s16_mask01
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: lshr16_v2s16_mask01
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = G_IMPLICIT_DEF
+    %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(0, 1)
+    %3:_(s32) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 16
+    %5:_(s32) = G_LSHR %3, %4
+    $vgpr0 = COPY %5
+...
+
+---
+name: lshr8_v2s16_mask01_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: lshr8_v2s16_mask01_nofold
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(0, 1)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[SHUF]](<2 x s16>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = G_IMPLICIT_DEF
+    %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(0, 1)
+    %3:_(s32) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 8
+    %5:_(s32) = G_LSHR %3, %4
+    $vgpr0 = COPY %5
+...
+
+---
+name: lshr16_v2s16_mask11
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: lshr16_v2s16_mask11
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = G_IMPLICIT_DEF
+    %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(1, 1)
+    %3:_(s32) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 16
+    %5:_(s32) = G_LSHR %3, %4
+    $vgpr0 = COPY %5
+...
+
+---
+name: lshr16_v2s16_mask10_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: lshr16_v2s16_mask10_nofold
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(1, 0)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[SHUF]](<2 x s16>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+    ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32)
+    %0:_(<2 x s16>) = COPY $vgpr0
+    %1:_(<2 x s16>) = G_IMPLICIT_DEF
+    %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(1, 0)
+    %3:_(s32) = G_BITCAST %2
+    %4:_(s32) = G_CONSTANT i32 16
+    %5:_(s32) = G_LSHR %3, %4
+    $vgpr0 = COPY %5
+...
+
+---
+name: lshr32_v2s32_mask01
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: lshr32_v2s32_mask01
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BITCAST]], [[C]](s64)
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(0, 1)
+    %3:_(s64) = G_BITCAST %2
+    %4:_(s64) = G_CONSTANT i64 32
+    %5:_(s64) = G_LSHR %3, %4
+    $vgpr0_vgpr1 = COPY %5
+...
+
+---
+name: lshr16_v2s32_mask01_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: lshr16_v2s32_mask01_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(0, 1)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[SHUF]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BITCAST]], [[C]](s64)
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(0, 1)
+    %3:_(s64) = G_BITCAST %2
+    %4:_(s64) = G_CONSTANT i64 16
+    %5:_(s64) = G_LSHR %3, %4
+    $vgpr0_vgpr1 = COPY %5
+...
+
+---
+name: lshr32_v2s32_mask11
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: lshr32_v2s32_mask11
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[COPY]](<2 x s32>)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BITCAST]], [[C]](s64)
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(1, 1)
+    %3:_(s64) = G_BITCAST %2
+    %4:_(s64) = G_CONSTANT i64 32
+    %5:_(s64) = G_LSHR %3, %4
+    $vgpr0_vgpr1 = COPY %5
+...
+
+---
+name: lshr32_v2s32_mask10_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: lshr32_v2s32_mask10_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(1, 0)
+    ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[SHUF]](<2 x s32>)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[BITCAST]], [[C]](s64)
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(<2 x s32>) = G_IMPLICIT_DEF
+    %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0:_, %1:_, shufflemask(1, 0)
+    %3:_(s64) = G_BITCAST %2
+    %4:_(s64) = G_CONSTANT i64 32
+    %5:_(s64) = G_LSHR %3, %4
+    $vgpr0_vgpr1 = COPY %5
+...