diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -681,6 +681,41 @@ Src0, Src1, Src2); } +// TODO: We could just use TernaryOp_match if we allow match() functions to +// optionally take a MachineOperand instead of always taking registers. +// We could then just have a m_ShuffleMask matcher like m_Reg for instance. +template <typename Src0Ty, typename Src1Ty> struct GShuffleVector_match { + Src0Ty Src0; + Src1Ty Src1; + ArrayRef<int> &ShuffleMask; + + GShuffleVector_match(const Src0Ty &Src0, const Src1Ty &Src1, + ArrayRef<int> &ShuffleMask) + : Src0(Src0), Src1(Src1), ShuffleMask(ShuffleMask) {} + + template <typename OpTy> + bool match(const MachineRegisterInfo &MRI, OpTy &&Op) { + MachineInstr *TmpMI; + if (mi_match(Op, MRI, m_MInstr(TmpMI))) { + if (TmpMI->getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && + TmpMI->getNumOperands() == 4) { + ShuffleMask = TmpMI->getOperand(3).getShuffleMask(); + return Src0.match(MRI, TmpMI->getOperand(1).getReg()) && + Src1.match(MRI, TmpMI->getOperand(2).getReg()); + } + } + + return false; + } +}; + +template <typename Src0Ty, typename Src1Ty> +inline GShuffleVector_match<Src0Ty, Src1Ty> +m_GShuffleVector(const Src0Ty &Src0, const Src1Ty &Src1, + ArrayRef<int> &ShuffleMask) { + return GShuffleVector_match<Src0Ty, Src1Ty>(Src0, Src1, ShuffleMask); +} + /// Matches a register negated by a G_SUB. 
/// G_SUB 0, %negated_reg template <typename SrcTy> diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -88,6 +88,12 @@ [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; +def trunc_shift_shufflevector_fold : GICombineRule< + (defs root:$trunc, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_TRUNC):$trunc, + [{ return RegBankHelper.matchTruncShiftShuffleVectorFold(*${trunc}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyTruncShiftShuffleVectorFold(*${trunc}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -128,7 +134,8 @@ def AMDGPURegBankCombinerHelper : GICombinerHelper< "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, - fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, + trunc_shift_shufflevector_fold]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -41,13 +41,15 @@ const TargetRegisterInfo &TRI; const SIInstrInfo &TII; CombinerHelper &Helper; + GISelChangeObserver &Observer; public: - AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper, + GISelChangeObserver &Observer) : B(B), MF(B.getMF()), MRI(*B.getMRI()), Subtarget(MF.getSubtarget<GCNSubtarget>()), RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()), - 
TII(*Subtarget.getInstrInfo()), Helper(Helper){}; + TII(*Subtarget.getInstrInfo()), Helper(Helper), Observer(Observer){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -74,6 +76,9 @@ void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); void applyClamp(MachineInstr &MI, Register &Reg); + bool matchTruncShiftShuffleVectorFold(MachineInstr &MI, Register &Reg); + void applyTruncShiftShuffleVectorFold(MachineInstr &MI, Register &Reg); + private: AMDGPU::SIModeRegisterDefaults getMode(); bool getIEEE(); @@ -328,6 +333,78 @@ MI.eraseFromParent(); } +bool AMDGPURegBankCombinerHelper::matchTruncShiftShuffleVectorFold( + MachineInstr &MI, Register &Reg) { + // Fold + // + // (G_TRUNC (G_LSHR + // (G_BITCAST (G_SHUFFLE_VECTOR %a, %b, shufflemask(1, ?))) + // , K)) + // + // into a simple + // + // (G_TRUNC (G_BITCAST(%a))) + // + // if the shift amount (K) is 1/2 of the destination type and + // the vector types have 2 elements. + + assert(MI.getOpcode() == AMDGPU::G_TRUNC); + + Register TruncSrc = MI.getOperand(1).getReg(); + const LLT TruncSrcTy = MRI.getType(TruncSrc); + const unsigned TruncSrcSize = TruncSrcTy.getSizeInBits(); + + Register LHS, RHS; + ArrayRef<int> ShuffleMask; + Optional<ValueAndVReg> ShiftAmount; + if (!mi_match(TruncSrc, MRI, + m_GLShr(m_GBitcast(m_GShuffleVector(m_Reg(LHS), m_Reg(RHS), + ShuffleMask)), + m_GCst(ShiftAmount)))) { + return false; + } + + // The shift amount is 1/2 of the scalar type. + if (ShiftAmount->Value != (TruncSrcSize / 2)) + return false; + + // The operands of the SHUFFLE_VECTOR must be the same size as its + // destination. This limitation could be lifted if needed, but + // applyTruncShiftShuffleVectorFold will need to take it into account and + // generate additional operations to trunc the input instead of just + // generating a bitcast. 
+ if (MRI.getType(LHS).getSizeInBits() != TruncSrcSize) + return false; + + // The SHUFFLE_VECTOR's dest is a 2-element vector + // and the first element is 1 (2nd element from first vector). + if (ShuffleMask.size() != 2 || ShuffleMask[0] != 1) + return false; + + // We can just replace the trunc src with a bitcast of the LHS of the + // G_SHUFFLE_VECTOR. + Reg = LHS; + return true; +} + +void AMDGPURegBankCombinerHelper::applyTruncShiftShuffleVectorFold( + MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + + Register TruncSrc = MI.getOperand(1).getReg(); + const RegisterBank *TruncSrcRB = MRI.getRegBankOrNull(TruncSrc); + assert(TruncSrcRB && "TruncSrc has no RegBank assigned"); + + Register NewSrc = MRI.createGenericVirtualRegister(MRI.getType(TruncSrc)); + MRI.setRegBank(NewSrc, *TruncSrcRB); + + B.buildBitcast(NewSrc, Reg); + + Observer.changingInstr(MI); + MI.getOperand(1).setReg(NewSrc); + Observer.changedInstr(MI); +} + AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { return MF.getInfo<SIMachineFunctionInfo>()->getMode(); } @@ -400,7 +477,7 @@ MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper, Observer); AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, RegBankHelper); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift-shufflevector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift-shufflevector.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift-shufflevector.mir @@ -0,0 +1,303 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: lshr16_v2s16_mask10 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: 
lshr16_v2s16_mask10 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... + +--- +name: lshr16_v2s16_mask10_multiple_shuffle_uses +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lshr16_v2s16_mask10_multiple_shuffle_uses + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(1, 0) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:vgpr(s32) = G_BITCAST [[SHUF]](<2 x s16>) + ; CHECK-NEXT: [[AND:%[0-9]+]]:vgpr(s32) = G_AND [[BITCAST1]], [[ANYEXT]] + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + %8:vgpr(s32) = G_BITCAST %2 + %9:vgpr(s32) = G_AND %8, %7 + $vgpr0 = COPY %9 
+... + +--- +name: lshr16_v2s16_mask10_notrunc_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lshr16_v2s16_mask10_notrunc_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(1, 0) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[SHUF]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[LSHR]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + $vgpr0 = COPY %5 +... + +--- +name: lshr16_v2s16_v4s16_mask10_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: lshr16_v2s16_v4s16_mask10_nofold + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<4 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s16>), [[DEF]], shufflemask(1, 0) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[SHUF]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:vgpr(<4 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, 
%1:vgpr, shufflemask(1, 0) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... + +--- +name: lshr8_v2s16_mask10_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lshr8_v2s16_mask10_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(1, 0) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[SHUF]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 8 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... 
+ +--- +name: lshr16_v2s16_mask11 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lshr16_v2s16_mask11 + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[BITCAST]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 1) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... + +--- +name: lshr16_v2s16_mask01_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: lshr16_v2s16_mask01_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s16>), [[DEF]], shufflemask(0, 1) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s32) = G_BITCAST [[SHUF]](<2 x s16>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s32) = G_ANYEXT [[TRUNC]](s16) + ; CHECK-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:vgpr(<2 x s16>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s16>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(0, 1) + %3:vgpr(s32) = G_BITCAST %2 + %4:vgpr(s32) = G_CONSTANT i32 16 + %5:vgpr(s32) = G_LSHR %3, %4 + %6:vgpr(s16) = G_TRUNC %5 + %7:vgpr(s32) = G_ANYEXT %6 + $vgpr0 = 
COPY %7 +... + +--- +name: lshr32_v2s32_mask10 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: lshr32_v2s32_mask10 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s32) = G_TRUNC [[BITCAST]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[TRUNC]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:vgpr(<2 x s32>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s64) = G_BITCAST %2 + %4:vgpr(s64) = G_CONSTANT i64 32 + %5:vgpr(s64) = G_LSHR %3, %4 + %6:vgpr(s32) = G_TRUNC %5 + %7:vgpr(s64) = G_ANYEXT %6 + $vgpr0_vgpr1 = COPY %7 +... + +--- +name: lshr16_v2s32_mask10_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: lshr16_v2s32_mask10_nofold + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(1, 0) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[SHUF]](<2 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[BITCAST]], [[C]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s32) = G_TRUNC [[LSHR]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[TRUNC]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:vgpr(<2 x s32>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 0) + %3:vgpr(s64) = G_BITCAST %2 + %4:vgpr(s64) = G_CONSTANT i64 16 + %5:vgpr(s64) = 
G_LSHR %3, %4 + %6:vgpr(s32) = G_TRUNC %5 + %7:vgpr(s64) = G_ANYEXT %6 + $vgpr0_vgpr1 = COPY %7 +... + +--- +name: lshr32_v2s32_mask11 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: lshr32_v2s32_mask11 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[COPY]](<2 x s32>) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s32) = G_TRUNC [[BITCAST]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[TRUNC]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:vgpr(<2 x s32>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(1, 1) + %3:vgpr(s64) = G_BITCAST %2 + %4:vgpr(s64) = G_CONSTANT i64 32 + %5:vgpr(s64) = G_LSHR %3, %4 + %6:vgpr(s32) = G_TRUNC %5 + %7:vgpr(s64) = G_ANYEXT %6 + $vgpr0_vgpr1 = COPY %7 +... + +--- +name: lshr32_v2s32_mask01_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: lshr32_v2s32_mask01_nofold + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<2 x s32>), [[DEF]], shufflemask(0, 1) + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:vgpr(s64) = G_BITCAST [[SHUF]](<2 x s32>) + ; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:vgpr(s64) = G_LSHR [[BITCAST]], [[C]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:vgpr(s32) = G_TRUNC [[LSHR]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:vgpr(s64) = G_ANYEXT [[TRUNC]](s32) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64) + %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:vgpr(<2 x s32>) = G_IMPLICIT_DEF + %2:vgpr(<2 x s32>) = G_SHUFFLE_VECTOR %0:vgpr, %1:vgpr, shufflemask(0, 
1) + %3:vgpr(s64) = G_BITCAST %2 + %4:vgpr(s64) = G_CONSTANT i64 32 + %5:vgpr(s64) = G_LSHR %3, %4 + %6:vgpr(s32) = G_TRUNC %5 + %7:vgpr(s64) = G_ANYEXT %6 + $vgpr0_vgpr1 = COPY %7 +...