Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -33,6 +33,7 @@
 class MachineDominatorTree;
 class LegalizerInfo;
 struct LegalityQuery;
+class TargetLowering;
 
 struct PreferredTuple {
   LLT Ty;                // The result type of the extend.
@@ -52,6 +53,11 @@
   Register Base;
 };
 
+struct RegisterImmPair {
+  Register Reg;
+  int64_t Imm;
+};
+
 using OperandBuildSteps =
     SmallVector<std::function<void(MachineInstrBuilder &)>, 4>;
 struct InstructionBuildSteps {
@@ -90,6 +96,8 @@
     return KB;
   }
 
+  const TargetLowering &getTargetLowering() const;
+
   /// \return true if the combine is running prior to legalization, or if \p
   /// Query is legal on the target.
   bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const;
@@ -218,6 +226,12 @@
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
 
+  // Transform a G_SHL with an extended source into a narrower shift if
+  // possible.
+  bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData);
+  bool applyCombineShlOfExtend(MachineInstr &MI,
+                               const RegisterImmPair &MatchData);
+
   /// Reduce a shift by a constant to an unmerge and a shift on a half sized
   /// type. This will not produce a shift smaller than \p TargetShiftSize.
   bool matchCombineShiftToUnmerge(MachineInstr &MI, unsigned TargetShiftSize,
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -375,6 +375,13 @@
   EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
                        bool LegalTypes = true) const;
 
+  /// Return the preferred type to use for the shift amount of a shift opcode,
+  /// given that the type of the value being shifted is \p ShiftValueTy.
+  LLVM_READONLY
+  virtual LLT getPreferredShiftAmountTy(LLT ShiftValueTy) const {
+    return ShiftValueTy;
+  }
+
   /// Returns the type to be used for the index operand of:
   /// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
   /// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -160,6 +160,14 @@
     [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>;
 
+// shl ([asz]ext x), y => zext (shl x, y), if the shift does not overflow the source type
+def reduce_shl_of_extend_matchdata : GIDefMatchData<"RegisterImmPair">;
+def reduce_shl_of_extend : GICombineRule<
+  (defs root:$dst, reduce_shl_of_extend_matchdata:$matchinfo),
+  (match (G_SHL $dst, $src0, $src1):$mi,
+         [{ return Helper.matchCombineShlOfExtend(*${mi}, ${matchinfo}); }]),
+  (apply [{ Helper.applyCombineShlOfExtend(*${mi}, ${matchinfo}); }])>;
+
 // [us]itofp(undef) = 0, because the result value is bounded.
 def undef_to_fp_zero : GICombineRule<
   (defs root:$root),
@@ -297,8 +305,10 @@
                                      binop_right_to_zero, p2i_to_i2p,
                                      i2p_to_p2i]>;
 
+def width_reduction_combines : GICombineGroup<[reduce_shl_of_extend]>;
+
 def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>;
 
 def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
-    hoist_logic_op_with_same_opcode_hands]>;
+    hoist_logic_op_with_same_opcode_hands, width_reduction_combines]>;
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -43,6 +43,10 @@
   (void)this->KB;
 }
 
+const TargetLowering &CombinerHelper::getTargetLowering() const {
+  return *Builder.getMF().getSubtarget().getTargetLowering();
+}
+
 bool CombinerHelper::isLegalOrBeforeLegalizer(
     const LegalityQuery &Query) const {
   return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -1439,6 +1443,59 @@
   return true;
 }
 
+// shl ([asz]ext x), y => zext (shl x, y), if the shift does not overflow the source type
+bool CombinerHelper::matchCombineShlOfExtend(MachineInstr &MI,
+                                             RegisterImmPair &MatchData) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHL && KB);
+
+  Register LHS = MI.getOperand(1).getReg();
+
+  Register ExtSrc;
+  if (!mi_match(LHS, MRI, m_GAnyExt(m_Reg(ExtSrc))) &&
+      !mi_match(LHS, MRI, m_GZExt(m_Reg(ExtSrc))) &&
+      !mi_match(LHS, MRI, m_GSExt(m_Reg(ExtSrc))))
+    return false;
+
+  // TODO: Should handle vector splat.
+  Register RHS = MI.getOperand(2).getReg();
+  auto MaybeShiftAmtVal = getConstantVRegValWithLookThrough(RHS, MRI);
+  if (!MaybeShiftAmtVal)
+    return false;
+
+  if (LI) {
+    LLT SrcTy = MRI.getType(ExtSrc);
+
+    // We only really care about the legality of the shifted value type. We
+    // can pick any type for the constant shift amount, so ask the target what
+    // to use. Otherwise we would have to guess and hope it is reported as
+    // legal.
+    LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(SrcTy);
+    if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SHL, {SrcTy, ShiftAmtTy}}))
+      return false;
+  }
+
+  int64_t ShiftAmt = MaybeShiftAmtVal->Value;
+  MatchData.Reg = ExtSrc;
+  MatchData.Imm = ShiftAmt;
+
+  unsigned MinLeadingZeros = KB->getKnownZeroes(ExtSrc).countLeadingOnes();
+  return MinLeadingZeros >= ShiftAmt;
+}
+
+bool CombinerHelper::applyCombineShlOfExtend(MachineInstr &MI,
+                                             const RegisterImmPair &MatchData) {
+  Register ExtSrcReg = MatchData.Reg;
+  int64_t ShiftAmtVal = MatchData.Imm;
+
+  LLT ExtSrcTy = MRI.getType(ExtSrcReg);
+  Builder.setInstrAndDebugLoc(MI);
+  auto ShiftAmt = Builder.buildConstant(ExtSrcTy, ShiftAmtVal);
+  auto NarrowShift =
+      Builder.buildShl(ExtSrcTy, ExtSrcReg, ShiftAmt, MI.getFlags());
+  Builder.buildZExt(MI.getOperand(0), NarrowShift);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
                                                 unsigned TargetShiftSize,
                                                 unsigned &ShiftVal) {
Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -53,6 +53,7 @@
 // FIXME: Is there a way to remove a single item from all_combines?
def all_combines_minus_extload : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combine_indexed_load_store, undef_combines, + width_reduction_combines, identity_combines] >; Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -372,6 +372,8 @@ EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; + LLT getPreferredShiftAmountTy(LLT Ty) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4301,6 +4301,12 @@ return (VT == MVT::i16) ? MVT::i16 : MVT::i32; } +LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { + return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) + ? Ty.changeElementSize(16) + : Ty.changeElementSize(32); +} + // Answering this is somewhat tricky and depends on the specific device which // have different rates for fma or all f64 operations. // Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir @@ -0,0 +1,311 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_anyext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_anyext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_anyext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ANYEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... 
+ +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_sext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_sext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_sext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_SEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... + +# Can't narrow this; need known bits +--- +name: shl_s64_by_2_from_zext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: shl_s64_by_2_from_zext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX6: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; GFX9-LABEL: name: shl_s64_by_2_from_zext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[SHL]](s64) + %0:_(s32) = COPY $vgpr0 + %1:_(s64) = G_ZEXT %0 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s64) = G_SHL %1, %2 + $vgpr0_vgpr1 = COPY %3 +... + +--- +name: narrow_shl_s64_by_2_from_anyext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_anyext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_anyext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ANYEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
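(Note: the boundary between the unchanged tests above and the narrowed ones that follow is the known-bits requirement in matchCombineShlOfExtend: the fold only fires when at least ShiftAmt leading bits of the extend's source are already known to be zero. A bare 32-bit source has no known leading zeros, so a shift by 2 is rejected; once the source is masked with 0x3fffffff, its top two bits are known zero and the 64-bit shift can be rewritten as a 32-bit one. A small standalone sketch of that arithmetic, with the values taken from these tests rather than from any new API:

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    // Known-zero bits left by "%masked = G_AND %narrow, 0x3fffffff".
    llvm::APInt KnownZero(32, 0xc0000000ULL);
    unsigned MinLeadingZeros = KnownZero.countLeadingOnes(); // == 2
    unsigned ShiftAmt = 2;
    // Same condition matchCombineShlOfExtend checks before narrowing.
    assert(MinLeadingZeros >= ShiftAmt && "shift would overflow the source");
    return 0;
  }
)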
+ +--- +name: narrow_shl_s64_by_2_from_zext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... + +--- +name: narrow_shl_s64_by_2_from_sext_s32 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_sext_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_sext_s32 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_SEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... 
+ +--- +name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(s32) = COPY $vgpr0 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX6: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s32_lookthrough_amount + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(s32) = COPY $vgpr0 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masked:_(s32) = G_AND %narrow, %masklow30 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %narrow:_(s32) = COPY $vgpr0 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masked:_(s32) = G_AND %narrow, %masklow30 + %extend:_(s64) = G_ZEXT %masked + %shiftamt64:_(s64) = G_CONSTANT i64 2 + %shiftamt:_(s32) = G_TRUNC %shiftamt64 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... + +# Can't introduce a 16-bit shift before gfx8 +--- +name: narrow_shl_s32_by_2_from_zext_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: %extend:_(s32) = G_ZEXT %masked(s16) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shl:_(s32) = G_SHL %extend, %shiftamt(s32) + ; GFX6: $vgpr0 = COPY %shl(s32) + ; GFX9-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0 = COPY %shl(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s32) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s32) = G_SHL %extend, %shiftamt + $vgpr0 = COPY %shl +... 
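(Note: the GFX6/GFX9 split in the test above comes from the legality gate that only runs in the post-legalizer combiner. Roughly, the matcher asks the target for a shift-amount type and then requires a G_SHL on the narrow source type to be legal; the per-target outcome annotated below is inferred from the checks in this test:

  // Guarded query from matchCombineShlOfExtend (only taken when LI is set).
  LLT SrcTy = MRI.getType(ExtSrc); // s16 for this test
  // SITargetLowering::getPreferredShiftAmountTy returns a 16-bit amount only
  // when the subtarget has 16-bit instructions (gfx8+), otherwise 32-bit.
  LLT ShiftAmtTy = getTargetLowering().getPreferredShiftAmountTy(SrcTy);
  if (!isLegalOrBeforeLegalizer({TargetOpcode::G_SHL, {SrcTy, ShiftAmtTy}}))
    return false; // gfx6: no legal 16-bit G_SHL, so the s32 shift is kept
)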
+ +--- +name: narrow_shl_s64_by_2_from_zext_s16 +tracksRegLiveness: true +legalized: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: %extend:_(s64) = G_ZEXT %masked(s16) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shl:_(s64) = G_SHL %extend, %shiftamt(s32) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: narrow_shl_s32_by_2_from_zext_s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX6: $vgpr0 = COPY %shl(s32) + ; GFX9-LABEL: name: narrow_shl_s32_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s32) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0 = COPY %shl(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s32) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s32) = G_SHL %extend, %shiftamt + $vgpr0 = COPY %shl +... 
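(Note: in this pre-legalizer file GFX6 also gets the 16-bit shift, unlike in the corresponding post-legalizer test. The match code only performs the legality query behind the "if (LI)" guard, and before legalization that information is not consulted, so the combine appears to apply for every subtarget here and the legalizer is later expected to widen the s16 shift where it is illegal.)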
+ +--- +name: narrow_shl_s64_by_2_from_zext_s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX6: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX6: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(s64) + ; GFX9-LABEL: name: narrow_shl_s64_by_2_from_zext_s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s16) = G_TRUNC %argument(s32) + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masked:_(s16) = G_AND %narrow, %masklow14 + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16) + ; GFX9: %shl:_(s64) = G_ZEXT [[SHL]](s16) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(s64) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s16) = G_TRUNC %argument + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masked:_(s16) = G_AND %narrow, %masklow14 + %extend:_(s64) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shl:_(s64) = G_SHL %extend, %shiftamt + $vgpr0_vgpr1 = COPY %shl +... + +--- +name: narrow_shl_s16_by_2_from_zext_s8 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_s16_by_2_from_zext_s8 + ; GFX6: liveins: $vgpr0 + ; GFX6: %argument:_(s32) = COPY $vgpr0 + ; GFX6: %narrow:_(s8) = G_TRUNC %argument(s32) + ; GFX6: %masklow6:_(s8) = G_CONSTANT i8 63 + ; GFX6: %masked:_(s8) = G_AND %narrow, %masklow6 + ; GFX6: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 + ; GFX6: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX6: %shl:_(s16) = G_ZEXT [[SHL]](s8) + ; GFX6: %result:_(s32) = G_ANYEXT %shl(s16) + ; GFX6: $vgpr0 = COPY %result(s32) + ; GFX9-LABEL: name: narrow_shl_s16_by_2_from_zext_s8 + ; GFX9: liveins: $vgpr0 + ; GFX9: %argument:_(s32) = COPY $vgpr0 + ; GFX9: %narrow:_(s8) = G_TRUNC %argument(s32) + ; GFX9: %masklow6:_(s8) = G_CONSTANT i8 63 + ; GFX9: %masked:_(s8) = G_AND %narrow, %masklow6 + ; GFX9: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2 + ; GFX9: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8) + ; GFX9: %shl:_(s16) = G_ZEXT [[SHL]](s8) + ; GFX9: %result:_(s32) = G_ANYEXT %shl(s16) + ; GFX9: $vgpr0 = COPY %result(s32) + %argument:_(s32) = COPY $vgpr0 + %narrow:_(s8) = G_TRUNC %argument + %masklow6:_(s8) = G_CONSTANT i8 63 + %masked:_(s8) = G_AND %narrow, %masklow6 + %extend:_(s16) = G_ZEXT %masked + %shiftamt:_(s16) = G_CONSTANT i16 2 + %shl:_(s16) = G_SHL %extend, %shiftamt + %result:_(s32) = G_ANYEXT %shl + $vgpr0 = COPY %result +... 
+ +--- +name: narrow_shl_v2s32_by_2_from_zext_v2s16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: narrow_shl_v2s32_by_2_from_zext_v2s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: %narrow:_(<2 x s16>) = COPY $vgpr0 + ; GFX6: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX6: %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14(s16), %masklow14(s16) + ; GFX6: %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + ; GFX6: %extend:_(<2 x s32>) = G_ZEXT %masked(<2 x s16>) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX6: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX6: $vgpr0_vgpr1 = COPY %shl(<2 x s32>) + ; GFX9-LABEL: name: narrow_shl_v2s32_by_2_from_zext_v2s16 + ; GFX9: liveins: $vgpr0 + ; GFX9: %narrow:_(<2 x s16>) = COPY $vgpr0 + ; GFX9: %masklow14:_(s16) = G_CONSTANT i16 16383 + ; GFX9: %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14(s16), %masklow14(s16) + ; GFX9: %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + ; GFX9: %extend:_(<2 x s32>) = G_ZEXT %masked(<2 x s16>) + ; GFX9: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX9: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX9: %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX9: $vgpr0_vgpr1 = COPY %shl(<2 x s32>) + %narrow:_(<2 x s16>) = COPY $vgpr0 + %masklow14:_(s16) = G_CONSTANT i16 16383 + %masklow14vec:_(<2 x s16>) = G_BUILD_VECTOR %masklow14, %masklow14 + %masked:_(<2 x s16>) = G_AND %narrow, %masklow14vec + %extend:_(<2 x s32>) = G_ZEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt, %shiftamt + %shl:_(<2 x s32>) = G_SHL %extend, %shiftamtvec + $vgpr0_vgpr1 = COPY %shl +... 
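(Note: this vector test is left untouched for the reason called out by the "TODO: Should handle vector splat" in matchCombineShlOfExtend: the shift amount is only recognized through a scalar constant lookup, so a G_BUILD_VECTOR splat of G_CONSTANTs never matches. The relevant lines from the patch:

  // TODO: Should handle vector splat.
  Register RHS = MI.getOperand(2).getReg();
  auto MaybeShiftAmtVal = getConstantVRegValWithLookThrough(RHS, MRI);
  if (!MaybeShiftAmtVal)
    return false; // a build_vector splat shift amount bails out here
)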
+ +--- +name: narrow_shl_v2s64_by_2_from_anyext_v2s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GFX6-LABEL: name: narrow_shl_v2s64_by_2_from_anyext_v2s32 + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6: %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX6: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX6: %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30(s32), %masklow30(s32) + ; GFX6: %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + ; GFX6: %extend:_(<2 x s64>) = G_ANYEXT %masked(<2 x s32>) + ; GFX6: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX6: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX6: %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX6: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl(<2 x s64>) + ; GFX9-LABEL: name: narrow_shl_v2s64_by_2_from_anyext_v2s32 + ; GFX9: liveins: $vgpr0_vgpr1 + ; GFX9: %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9: %masklow30:_(s32) = G_CONSTANT i32 1073741823 + ; GFX9: %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30(s32), %masklow30(s32) + ; GFX9: %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + ; GFX9: %extend:_(<2 x s64>) = G_ANYEXT %masked(<2 x s32>) + ; GFX9: %shiftamt:_(s32) = G_CONSTANT i32 2 + ; GFX9: %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt(s32), %shiftamt(s32) + ; GFX9: %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec(<2 x s32>) + ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl(<2 x s64>) + %narrow:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %masklow30:_(s32) = G_CONSTANT i32 1073741823 + %masklow30vec:_(<2 x s32>) = G_BUILD_VECTOR %masklow30, %masklow30 + %masked:_(<2 x s32>) = G_AND %narrow, %masklow30vec + %extend:_(<2 x s64>) = G_ANYEXT %masked + %shiftamt:_(s32) = G_CONSTANT i32 2 + %shiftamtvec:_(<2 x s32>) = G_BUILD_VECTOR %shiftamt, %shiftamt + %shl:_(<2 x s64>) = G_SHL %extend, %shiftamtvec + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %shl +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -0,0 +1,562 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s + +; Test optimization to reduce shifts to narrower sizes. 
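(Note: the scalar cases below show what the narrowing buys at the ISA level. When the mask proves the shift cannot overflow 32 bits, the 64-bit shift becomes a single 32-bit shift with a known-zero high half (s_lshl_b32 plus a zero-extension, or v_lshlrev_b32 plus v_mov_b32 v1, 0), while the *_overflow variants, whose masks are too wide, keep the full 64-bit shift (s_lshl_b64 / v_lshl*_b64).)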
+ +define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_zext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_andn2_b32 s0, s0, -2.0 +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 1073741823 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_zext_i32(i32 %x) { +; GCN-LABEL: v_shl_i64_zext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x3fffffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 1073741823 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_sext_i32(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_sext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_and_b32 s0, s0, 0x1fffffff +; GCN-NEXT: s_lshl_b32 s0, s0, 2 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 536870911 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_sext_i32(i32 %x) { +; GCN-LABEL: v_shl_i64_sext_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 536870911 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_zext_i32_overflow(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_zext_i32_overflow: +; GCN: ; %bb.0: +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 2147483647 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_zext_i32_overflow(i32 %x) { +; GFX7-LABEL: v_shl_i64_zext_i32_overflow: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i64_zext_i32_overflow: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i64_zext_i32_overflow: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 2147483647 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_ps i64 @s_shl_i64_sext_i32_overflow(i32 inreg %x) { +; GCN-LABEL: s_shl_i64_sext_i32_overflow: +; GCN: ; %bb.0: +; GCN-NEXT: s_bitset0_b32 s0, 31 +; GCN-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and i32 %x, 2147483647 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define i64 @v_shl_i64_sext_i32_overflow(i32 %x) { +; GFX7-LABEL: v_shl_i64_sext_i32_overflow: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i64_sext_i32_overflow: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i64_sext_i32_overflow: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i32 %x, 2147483647 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + ret i64 %shl +} + +define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) { +; GFX7-LABEL: mulu24_shl64: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: mulu24_shl64: +; GFX8: ; %bb.0: ; %bb +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: mulu24_shl64: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_and_b32_e32 v0, 6, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 7, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: s_endpgm +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp1 = and i32 %tmp, 6 + %mulconv = mul nuw nsw i32 %tmp1, 7 + %tmp2 = zext i32 %mulconv to i64 + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp2 + store i32 0, i32 addrspace(1)* %tmp3, align 4 + ret void +} + +define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { +; GFX7-LABEL: muli24_shl64: +; GFX7: ; %bb.0: ; %bb +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[0:1], 2 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: buffer_load_dword v7, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: v_lshl_b64 v[3:4], v[0:1], 3 +; GFX7-NEXT: v_mov_b32_e32 v6, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v7 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], 
v[1:2], 3 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, v5, v3 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: muli24_shl64: +; GFX8: ; %bb.0: ; %bb +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX8-NEXT: flat_load_dword v7, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], 3, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v0, 0xff800000, v7 +; GFX8-NEXT: v_mul_i32_i24_e32 v1, -7, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v3 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: muli24_shl64: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: v_lshlrev_b64 v[3:4], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, 0xff800000, v7 +; GFX9-NEXT: v_mul_i32_i24_e32 v1, -7, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp2 = sext i32 %tmp to i64 + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp2 + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = or i32 %tmp4, -8388608 + %tmp6 = mul nsw i32 %tmp5, -7 + %tmp7 = zext i32 %tmp6 to i64 + %tmp8 = shl nuw nsw i64 %tmp7, 3 + %tmp9 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp2 + store i64 %tmp8, i64 addrspace(1)* %tmp9, align 8 + ret void +} + +define amdgpu_ps <2 x i64> @s_shl_v2i64_zext_v2i32(<2 x i32> inreg %x) { +; GCN-LABEL: s_shl_v2i64_zext_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_brev_b32 s2, -4 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_bfe_u64 s[2:3], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: s_bfe_u64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and <2 x i32> %x, + %ext = zext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) { +; GFX7-LABEL: v_shl_v2i64_zext_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_brev_b32 s4, -4 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: 
v_lshl_b64 v[2:3], v[2:3], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_v2i64_zext_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_brev_b32 s4, -4 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_v2i64_zext_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, -4 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and <2 x i32> %x, + %ext = zext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define amdgpu_ps <2 x i64> @s_shl_v2i64_sext_v2i32(<2 x i32> inreg %x) { +; GCN-LABEL: s_shl_v2i64_sext_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_brev_b32 s2, -8 +; GCN-NEXT: s_mov_b32 s3, s2 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x200000 +; GCN-NEXT: s_mov_b32 s0, s1 +; GCN-NEXT: s_bfe_i64 s[4:5], s[0:1], 0x200000 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[4:5], 2 +; GCN-NEXT: ; return to shader part epilog + %and = and <2 x i32> %x, + %ext = sext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) { +; GFX7-LABEL: v_shl_v2i64_sext_v2i32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_brev_b32 s4, -8 +; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX7-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[2:3], v[2:3], 2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_v2i64_sext_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_brev_b32 s4, -8 +; GFX8-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_v2i64_sext_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_brev_b32 s4, -8 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and <2 x i32> %x, + %ext = sext <2 x i32> %and to <2 x i64> + %shl = shl <2 x i64> %ext, + ret <2 x i64> %shl +} + +define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { +; GFX7-LABEL: s_shl_i32_zext_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX7-NEXT: s_lshl_b32 s0, s0, 2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_i32_zext_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX8-NEXT: s_bfe_u32 
s1, 2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_i32_zext_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff +; GFX9-NEXT: s_bfe_u32 s1, 2, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: ; return to shader part epilog + %and = and i16 %x, 16383 + %ext = zext i16 %and to i32 + %shl = shl i32 %ext, 2 + ret i32 %shl +} + +define i32 @v_shl_i32_zext_i16(i16 %x) { +; GFX7-LABEL: v_shl_i32_zext_i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_shl_i32_zext_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_shl_i32_zext_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0x3fff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %and = and i16 %x, 16383 + %ext = zext i16 %and to i32 + %shl = shl i32 %ext, 2 + ret i32 %shl +} + +define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { +; GFX7-LABEL: s_shl_v2i32_zext_v2i16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_or_b32 s0, s1, s0 +; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX7-NEXT: s_lshr_b32 s1, s0, 16 +; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s1, s1, s2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 2 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_shl_v2i32_zext_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s2, 0x3fff +; GFX8-NEXT: s_mov_b32 s4, 0xffff +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_mov_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_mov_b32 s5, s4 +; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_shl_v2i32_zext_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff3fff +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_lshl_b32 s1, s1, 2 +; GFX9-NEXT: ; return to shader part epilog + %and = and <2 x i16> %x, + %ext = zext <2 x i16> %and to <2 x i32> + %shl = shl <2 x i32> %ext, + ret <2 x i32> %shl +} + +; FIXME: This doesn't do what we want. The pre-legalizer combiner +; fails to handle the vector splat. The post-legalizer sees the zext +; legalized into the and. This is probably not that important, since +; we really do this combine in the machine level for lowered +; getelementptrs. 
+define <2 x i32> @v_shl_v2i32_zext_v2i16(<2 x i16> %x) {
+; GFX7-LABEL: v_shl_v2i32_zext_v2i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT:    s_mov_b32 s4, 0xffff
+; GFX7-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_shl_v2i32_zext_v2i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s4, 0xffff
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v2, 2
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_shl_v2i32_zext_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff3fff, v0
+; GFX9-NEXT:    s_mov_b32 s5, 0xffff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b32 s4, 2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+  %and = and <2 x i16> %x, <i16 16383, i16 16383>
+  %ext = zext <2 x i16> %and to <2 x i32>
+  %shl = shl <2 x i32> %ext, <i32 2, i32 2>
+  ret <2 x i32> %shl
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable willreturn }