Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -340,6 +340,9 @@
   LegalizeResult lowerLoad(MachineInstr &MI);
   LegalizeResult lowerStore(MachineInstr &MI);
   LegalizeResult lowerBitCount(MachineInstr &MI);
+  LegalizeResult lowerFunnelShiftWithInverse(MachineInstr &MI);
+  LegalizeResult lowerFunnelShiftAsShifts(MachineInstr &MI);
+  LegalizeResult lowerFunnelShift(MachineInstr &MI);
   LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI);
   LegalizeResult lowerUITOFP(MachineInstr &MI);
Index: llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1438,6 +1438,13 @@
     return buildInstr(TargetOpcode::G_SMULH, {Dst}, {Src0, Src1}, Flags);
   }
 
+  /// Build and insert \p Res = G_UREM \p Op0, \p Op1
+  MachineInstrBuilder buildURem(const DstOp &Dst, const SrcOp &Src0,
+                                const SrcOp &Src1,
+                                Optional<unsigned> Flags = None) {
+    return buildInstr(TargetOpcode::G_UREM, {Dst}, {Src0, Src1}, Flags);
+  }
+
   MachineInstrBuilder buildFMul(const DstOp &Dst, const SrcOp &Src0,
                                 const SrcOp &Src1,
                                 Optional<unsigned> Flags = None) {
Index: llvm/include/llvm/CodeGen/GlobalISel/Utils.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/Utils.h
+++ llvm/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -328,6 +328,13 @@
 Optional<RegOrConstant> getVectorSplat(const MachineInstr &MI,
                                        const MachineRegisterInfo &MRI);
 
+/// Attempt to match a unary predicate against a scalar/splat constant or every
+/// element of a constant G_BUILD_VECTOR/G_BUILD_VECTOR_TRUNC.
+bool matchUnaryPredicate(
+    const MachineRegisterInfo &MRI, Register Reg,
+    std::function<bool(const MachineRegisterInfo &, Register)> Match,
+    bool AllowUndefs);
+
 /// Returns true if given the TargetLowering's boolean contents information,
 /// the value \p Val contains a true value.
 bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3151,6 +3151,9 @@
   case G_SDIVREM:
   case G_UDIVREM:
     return lowerDIVREM(MI);
+  case G_FSHL:
+  case G_FSHR:
+    return lowerFunnelShift(MI);
   }
 }
 
@@ -5096,6 +5099,131 @@
   }
 }
 
+// Check that (every element of) Reg is undef or not an exact multiple of BW.
+static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
+                                        Register Reg, unsigned BW) {
+  return matchUnaryPredicate(
+      MRI, Reg,
+      [=](const MachineRegisterInfo &MRI, Register R) {
+        Optional<APInt> C = getConstantVRegVal(R, MRI);
+        return !C || C->urem(BW) != 0;
+      },
+      true);
+}
+
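A minimal sketch of the predicate above in action (not part of the patch; it assumes a MachineIRBuilder B and MachineRegisterInfo MRI set up as in the existing GlobalISel unit tests, and the variable names are invented for illustration):

  LLT S16 = LLT::scalar(16);
  LLT V2S16 = LLT::vector(2, 16);
  auto Eight = B.buildConstant(S16, 8);
  auto Amt = B.buildBuildVector(V2S16, {Eight.getReg(0), Eight.getReg(0)});
  // Every element is 8, and 8 urem 16 != 0, so this returns true. A splat of
  // 16 (or 0) would return false, and G_IMPLICIT_DEF elements are accepted
  // because the lowering passes AllowUndefs = true.
  bool NonZeroMod = isNonZeroModBitWidthOrUndef(MRI, Amt.getReg(0), /*BW=*/16);
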
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  Register Z = MI.getOperand(3).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(Z);
+
+  unsigned BW = Ty.getScalarSizeInBits();
+  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+
+  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+    // fshl X, Y, Z -> fshr X, Y, -Z
+    // fshr X, Y, Z -> fshl X, Y, -Z
+    auto Zero = MIRBuilder.buildConstant(ShTy, 0);
+    Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
+  } else {
+    // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
+    // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
+    auto One = MIRBuilder.buildConstant(ShTy, 1);
+    if (IsFSHL) {
+      Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+      X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
+    } else {
+      X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
+      Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
+    }
+
+    Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
+  }
+
+  MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
+  MI.eraseFromParent();
+  return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
+  Register Dst = MI.getOperand(0).getReg();
+  Register X = MI.getOperand(1).getReg();
+  Register Y = MI.getOperand(2).getReg();
+  Register Z = MI.getOperand(3).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(Z);
+
+  const unsigned BW = Ty.getScalarSizeInBits();
+  const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+
+  Register ShX, ShY;
+  Register ShAmt, InvShAmt;
+
+  // FIXME: Emit optimized urem by constant instead of letting it expand later.
+  if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
+    // fshl: X << C | Y >> (BW - C)
+    // fshr: X << (BW - C) | Y >> C
+    // where C = Z % BW is not zero
+    auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+    ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+    InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
+    ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
+    ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
+  } else {
+    // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+    // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
+    auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
+    if (isPowerOf2_32(BW)) {
+      // Z % BW -> Z & (BW - 1)
+      ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
+      // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+      auto NotZ = MIRBuilder.buildNot(ShTy, Z);
+      InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
+    } else {
+      auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
+      ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
+      InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
+    }
+
+    auto One = MIRBuilder.buildConstant(ShTy, 1);
+    if (IsFSHL) {
+      ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
+      auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
+      ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
+    } else {
+      auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
+      ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
+      ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
+    }
+  }
+
+  MIRBuilder.buildOr(Dst, ShX, ShY);
+  MI.eraseFromParent();
+  return Legalized;
+}
+
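To make the arithmetic concrete, a minimal sketch (register names invented, not part of the patch) of what lowerFunnelShiftAsShifts emits for G_FSHL %dst(s32), %x, %y, %z when %z is G_CONSTANT i32 7, so isNonZeroModBitWidthOrUndef succeeds and the first branch is taken:

  // %bw    = G_CONSTANT i32 32
  // %shamt = G_UREM %z, %bw      ; 7 % 32 = 7
  // %inv   = G_SUB %bw, %shamt   ; 32 - 7 = 25
  // %shx   = G_SHL %x, %shamt    ; X << 7
  // %shy   = G_LSHR %y, %inv     ; Y >> 25
  // %dst   = G_OR %shx, %shy
  //
  // When the amount may be 0 mod BW, the else branch instead pre-shifts Y
  // right by 1 (or X left by 1 for fshr), so the variable shift amount is at
  // most BW - 1 and the undefined shift-by-BW case is never emitted.
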
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
+  // These operations approximately do the following (while avoiding undefined
+  // shifts by BW):
+  // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
+  // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+  Register Dst = MI.getOperand(0).getReg();
+  LLT Ty = MRI.getType(Dst);
+  LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
+
+  bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
+  unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
+  if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
+    return lowerFunnelShiftAsShifts(MI);
+  return lowerFunnelShiftWithInverse(MI);
+}
+
 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
 // representation.
 LegalizerHelper::LegalizeResult
Index: llvm/lib/CodeGen/GlobalISel/Utils.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -926,6 +926,37 @@
   return RegOrConstant(Reg);
 }
 
+bool llvm::matchUnaryPredicate(
+    const MachineRegisterInfo &MRI, Register Reg,
+    std::function<bool(const MachineRegisterInfo &, Register)> Match,
+    bool AllowUndefs) {
+
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  if (AllowUndefs && Def->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+    return true;
+
+  if (Def->getOpcode() == TargetOpcode::G_CONSTANT)
+    return Match(MRI, Reg);
+
+  if (Def->getOpcode() != TargetOpcode::G_BUILD_VECTOR &&
+      Def->getOpcode() != TargetOpcode::G_BUILD_VECTOR_TRUNC)
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+    Register SrcElt = Def->getOperand(I).getReg();
+    if (AllowUndefs) {
+      const MachineInstr *SrcDef = getDefIgnoringCopies(SrcElt, MRI);
+      if (SrcDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+        continue;
+    }
+
+    if (!Match(MRI, SrcElt))
+      return false;
+  }
+
+  return true;
+}
+
 bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
                           bool IsFP) {
   switch (TLI.getBooleanContents(IsVector, IsFP)) {
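The new helper is general enough for callers beyond isNonZeroModBitWidthOrUndef; a minimal sketch (hypothetical lambda and names, not part of the patch) that accepts Reg only when every scalar/splat/build-vector element is a constant power of two:

  bool AllPow2 = llvm::matchUnaryPredicate(
      MRI, Reg,
      [](const MachineRegisterInfo &MRI, Register R) {
        Optional<APInt> C = getConstantVRegVal(R, MRI);
        return C && C->isPowerOf2();
      },
      /*AllowUndefs=*/false);
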
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1595,11 +1595,26 @@
     .clampScalar(0, S32, S64)
     .lower();
 
+  // TODO: Only try to form v2s16 with legal packed instructions.
   getActionDefinitionsBuilder(G_FSHR)
     .legalFor({{S32, S32}})
+    .lowerFor({{V2S16, V2S16}})
+    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
     .scalarize(0)
     .lower();
 
+  if (ST.hasVOP3PInsts()) {
+    getActionDefinitionsBuilder(G_FSHL)
+      .lowerFor({{V2S16, V2S16}})
+      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
+      .scalarize(0)
+      .lower();
+  } else {
+    getActionDefinitionsBuilder(G_FSHL)
+      .scalarize(0)
+      .lower();
+  }
+
   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
     .legalFor({S64});
 
@@ -1619,9 +1634,7 @@
       G_SADDO, G_SSUBO,
 
       // TODO: Implement
-      G_FMINIMUM, G_FMAXIMUM,
-      G_FSHL
-    }).lower();
+      G_FMINIMUM, G_FMAXIMUM}).lower();
 
   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                                G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -0,0 +1,7474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
+; GFX6-LABEL: s_fshl_i7:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT: s_sub_i32 s3, 0, 7
+; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT: s_movk_i32 s3, 0x7f
+; GFX6-NEXT: s_and_b32 s2, s2, s3
+; GFX6-NEXT: s_and_b32 s1, s1, s3
+; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT: s_lshr_b32 s1, s1, 1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0
+; GFX6-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX6-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshl_i7:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: s_sub_i32 s3, 0, 7
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT: s_movk_i32 s3, 0x7f
+; GFX8-NEXT: s_and_b32 s2, s2, s3
+; GFX8-NEXT: s_and_b32 s1, s1, s3
+; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, 1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0
+; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i7: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s3, 0, 7 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x7f +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX10-NEXT: s_sub_i32 s3, 0, 7 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: s_movk_i32 s3, 0x7f +; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u16_e64 v1, 6, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) + ret i7 %result +} + +define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { +; GFX6-LABEL: v_fshl_i7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX6-NEXT: s_sub_i32 s4, 0, 7 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: 
v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX8-NEXT: s_sub_i32 s4, 0, 7 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_i32 s4, 0, 7 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 
7 +; GFX10-NEXT: s_sub_i32 s4, 0, 7 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f +; GFX10-NEXT: v_sub_nc_u16_e64 v4, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt) + ret i7 %result +} + +define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { +; GFX6-LABEL: s_fshl_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s3, s2, 7 +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s3, s2, 7 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) + ret i8 %result +} + +define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { +; GFX6-LABEL: v_fshl_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 
v2, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt) + ret i8 %result +} + +define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { +; GFX6-LABEL: s_fshl_i8_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 4 +; GFX6-NEXT: s_lshr_b32 s1, s1, 4 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i8_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 4 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i8_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 4 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i8_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, 4 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) + ret i8 %result +} + +define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_fshl_i8_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i8_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 4 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i8_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 4 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 
4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i8_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) + ret i8 %result +} + +define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { +; GFX6-LABEL: s_fshl_i8_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 5 +; GFX6-NEXT: s_lshr_b32 s1, s1, 3 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i8_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 5 +; GFX8-NEXT: s_lshr_b32 s1, s1, 3 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i8_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 5 +; GFX9-NEXT: s_lshr_b32 s1, s1, 3 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i8_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, 5 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, 3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) + ret i8 %result +} + +define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_fshl_i8_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i8_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 5, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i8_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 3 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 5, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i8_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 5, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) + ret i8 %result +} + +define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 inreg %amt.arg) { +; GFX6-LABEL: s_fshl_v2i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s5, s2, 7 +; GFX6-NEXT: s_movk_i32 s6, 0xff +; 
GFX6-NEXT: s_lshr_b32 s3, s0, 8 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_and_b32 s5, s1, s6 +; GFX6-NEXT: s_lshr_b32 s4, s2, 8 +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_lshr_b32 s1, s1, 9 +; GFX6-NEXT: s_lshr_b32 s2, s5, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s4, 7 +; GFX6-NEXT: s_andn2_b32 s4, 7, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s4 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v2i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_movk_i32 s6, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s1, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s3, s4, s6 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s5 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v2i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s1, 8 +; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s5, s2, 8 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s3, s4, s6 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s5 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v2i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_movk_i32 s6, 0xff +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s7, s2, 7 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s7, s5, 7 +; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s4 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s2, s6 +; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 +; 
GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %lhs = bitcast i16 %lhs.arg to <2 x i8> + %rhs = bitcast i16 %rhs.arg to <2 x i8> + %amt = bitcast i16 %amt.arg to <2 x i8> + %result = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt) + %cast.result = bitcast <2 x i8> %result to i16 + ret i16 %cast.result +} + +define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { +; GFX6-LABEL: v_fshl_v2i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 1, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 9, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v5, 0xff +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: 
v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v3, v3, v5 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v7, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_setpc_b64 s[30:31] + %lhs = bitcast i16 %lhs.arg to <2 x i8> + %rhs = bitcast i16 %rhs.arg to <2 x i8> + %amt = bitcast i16 %amt.arg to <2 x i8> + %result = call <2 x i8> @llvm.fshl.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt) + %cast.result = bitcast <2 x i8> %result to i16 + ret i16 %cast.result +} + +define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 inreg %amt.arg) { +; GFX6-LABEL: s_fshl_v4i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s9, s2, 7 +; GFX6-NEXT: s_movk_i32 s10, 0xff +; GFX6-NEXT: s_lshr_b32 s3, s0, 8 +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 +; GFX6-NEXT: s_lshr_b32 s5, s0, 24 +; GFX6-NEXT: s_lshl_b32 s0, s0, s9 +; GFX6-NEXT: s_and_b32 s9, s1, s10 +; GFX6-NEXT: s_lshr_b32 s6, s2, 8 +; GFX6-NEXT: s_lshr_b32 s7, s2, 16 +; GFX6-NEXT: s_lshr_b32 s8, s2, 24 +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshr_b32 s9, s9, 1 +; GFX6-NEXT: s_lshr_b32 s2, s9, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s6, 7 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_lshr_b32 s3, s1, 9 +; GFX6-NEXT: s_movk_i32 s9, 0x7f +; GFX6-NEXT: s_andn2_b32 s6, 7, s6 +; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s3, s7, 7 +; GFX6-NEXT: s_lshl_b32 s3, s4, s3 +; GFX6-NEXT: s_lshr_b32 s4, s1, 17 +; GFX6-NEXT: s_andn2_b32 s6, 7, s7 +; GFX6-NEXT: s_and_b32 s4, s4, s9 +; GFX6-NEXT: s_lshr_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s2, s10 +; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s4, s8, 7 +; GFX6-NEXT: s_andn2_b32 s6, 7, s8 +; GFX6-NEXT: s_lshr_b32 s1, s1, 25 +; GFX6-NEXT: s_lshl_b32 s4, s5, s4 +; GFX6-NEXT: s_lshr_b32 s1, s1, s6 +; GFX6-NEXT: s_and_b32 s0, s0, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 8 +; GFX6-NEXT: s_or_b32 s1, s4, s1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s3, s10 +; GFX6-NEXT: s_and_b32 s1, s1, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: 
s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v4i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s13, 0xff +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s9, s2, 8 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s9 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_lshl_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_andn2_b32 s3, 7, s10 +; GFX8-NEXT: s_lshr_b32 s4, s4, 1 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_lshl_b32 s3, s5, s3 +; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s13, 0xff +; GFX9-NEXT: s_lshr_b32 s6, s1, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NEXT: s_lshr_b32 s8, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: s_lshr_b32 s10, s2, 16 +; GFX9-NEXT: s_lshr_b32 s11, s2, 24 +; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s9 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_lshl_b32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_andn2_b32 s3, 7, s10 +; GFX9-NEXT: s_lshr_b32 s4, s4, 1 +; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_lshl_b32 s3, s5, s3 +; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; 
GFX9-NEXT: s_lshr_b32 s4, s5, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v4i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s11, 0xff +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_lshr_b32 s12, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s2, s6, s11 +; GFX10-NEXT: s_and_b32 s6, s9, 7 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, s13 +; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s1, s3, s2 +; GFX10-NEXT: s_and_b32 s2, s7, s11 +; GFX10-NEXT: s_and_b32 s3, s10, 7 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_andn2_b32 s6, 7, s10 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: s_lshr_b32 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s4, s12, 7 +; GFX10-NEXT: s_andn2_b32 s6, 7, s12 +; GFX10-NEXT: s_lshr_b32 s7, s8, 1 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_lshr_b32 s5, s7, s6 +; GFX10-NEXT: s_and_b32 s0, s0, s11 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s2, s11 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s3, s11 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %lhs = bitcast i32 %lhs.arg to <4 x i8> + %rhs = bitcast i32 %rhs.arg to <4 x i8> + %amt = bitcast i32 %amt.arg to <4 x i8> + %result = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt) + %cast.result = bitcast <4 x i8> %result to i32 + ret i32 %cast.result +} + +define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { +; GFX6-LABEL: v_fshl_v4i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v10, 0xff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 1, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v10 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 9, v1 +; GFX6-NEXT: s_movk_i32 s4, 0x7f +; GFX6-NEXT: 
v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v7 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 17, v1 +; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_mov_b32_e32 v9, 0xff +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 +; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v9 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v4i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX8-NEXT: v_mov_b32_e32 v9, 0xff +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX8-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX9-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, v8, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v8, v2 +; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v8, v3 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX9-NEXT: v_and_b32_sdwa v6, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_e32 v6, 1, v6 +; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v6 +; GFX9-NEXT: v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v6, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v4i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v11, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v15, 0xff +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, 
v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b16_e64 v3, v8, v3 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b16_e64 v6, 1, v6 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_lshrrev_b16_e64 v7, 1, v7 +; GFX10-NEXT: v_lshrrev_b16_e64 v6, v11, v6 +; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, v5 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_lshrrev_b16_e64 v12, 1, v12 +; GFX10-NEXT: v_lshrrev_b16_e64 v5, v13, v7 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v8, v1 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX10-NEXT: v_lshrrev_b16_e64 v7, v10, v12 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %lhs = bitcast i32 %lhs.arg to <4 x i8> + %rhs = bitcast i32 %rhs.arg to <4 x i8> + %amt = bitcast i32 %amt.arg to <4 x i8> + %result = call <4 x i8> @llvm.fshl.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt) + %cast.result = bitcast <4 x i8> %result to i32 + ret i32 %cast.result +} + +define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) { +; GFX6-LABEL: s_fshl_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_sub_i32 s3, 0, 24 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_and_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_lshr_b32_e32 v1, s1, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_sub_i32 s3, 0, 24 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX8-NEXT: s_mov_b32 s3, 0xffffff +; GFX8-NEXT: s_and_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; 
GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s3, 0, 24 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xffffff +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 +; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX10-NEXT: s_sub_i32 s3, 0, 24 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) + ret i24 %result +} + +define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { +; GFX6-LABEL: v_fshl_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX6-NEXT: s_sub_i32 s4, 0, 24 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; 
GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX8-NEXT: s_sub_i32 s4, 0, 24 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_i32 s4, 0, 24 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 +; GFX10-NEXT: s_sub_i32 s4, 0, 24 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) + ret i24 %result +} + +define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) { +; GFX6-LABEL: s_fshl_v2i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshr_b32 s6, s0, 8 +; GFX6-NEXT: s_movk_i32 s10, 0xff +; GFX6-NEXT: s_and_b32 s6, s6, s10 +; GFX6-NEXT: s_lshr_b32 s7, s0, 16 +; GFX6-NEXT: s_lshr_b32 s8, s0, 24 +; GFX6-NEXT: s_and_b32 s0, s0, s10 +; GFX6-NEXT: s_lshl_b32 s6, s6, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s6, s7, s10 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_lshr_b32 s9, s1, 8 +; GFX6-NEXT: s_and_b32 s1, s1, s10 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s6, s9, s10 +; GFX6-NEXT: s_or_b32 s1, s8, s1 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s6 +; GFX6-NEXT: s_lshr_b32 s6, s2, 8 +; GFX6-NEXT: s_and_b32 s6, s6, s10 +; GFX6-NEXT: s_lshr_b32 s7, s2, 16 +; GFX6-NEXT: s_lshr_b32 s8, s2, 24 +; GFX6-NEXT: s_and_b32 s2, s2, s10 +; GFX6-NEXT: s_lshl_b32 s6, s6, 8 +; GFX6-NEXT: s_or_b32 s2, s2, s6 +; GFX6-NEXT: s_and_b32 s6, s7, s10 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_lshr_b32 s9, s3, 8 +; GFX6-NEXT: s_and_b32 s3, s3, s10 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_lshl_b32 s3, s3, 8 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX6-NEXT: s_or_b32 s2, s2, s6 +; GFX6-NEXT: s_and_b32 s6, s9, s10 +; GFX6-NEXT: s_or_b32 s3, s8, s3 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s3, s3, s6 +; GFX6-NEXT: s_lshr_b32 s6, s4, 8 +; GFX6-NEXT: s_and_b32 s6, s6, s10 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: s_lshr_b32 s7, s4, 16 +; GFX6-NEXT: s_lshr_b32 s8, s4, 24 +; GFX6-NEXT: s_and_b32 s4, s4, s10 +; GFX6-NEXT: s_lshl_b32 s6, s6, 8 +; GFX6-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s6, s7, s10 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 
0x100000 +; GFX6-NEXT: s_lshl_b32 s6, s6, 16 +; GFX6-NEXT: s_or_b32 s4, s4, s6 +; GFX6-NEXT: s_sub_i32 s6, 0, 24 +; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX6-NEXT: s_lshr_b32 s9, s5, 8 +; GFX6-NEXT: s_and_b32 s5, s5, s10 +; GFX6-NEXT: s_lshl_b32 s5, s5, 8 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: s_and_b32 s7, s9, s10 +; GFX6-NEXT: s_or_b32 s5, s8, s5 +; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_lshl_b32 s7, s7, 16 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX6-NEXT: s_or_b32 s5, s5, s7 +; GFX6-NEXT: s_mov_b32 s7, 0xffffff +; GFX6-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX6-NEXT: s_lshr_b32 s0, s2, 1 +; GFX6-NEXT: v_and_b32_e32 v2, s7, v2 +; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_lshr_b32 s0, s3, 1 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 +; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX6-NEXT: v_and_b32_e32 v2, s10, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s10, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s10, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s10, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s10, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s10, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: v_readfirstlane_b32 s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v2i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s6, s0, 8 +; GFX8-NEXT: s_movk_i32 s10, 0xff +; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b32 s8, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: 
s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_lshr_b32 s9, s1, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_lshl_b32 s1, s1, s11 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_or_b32 s1, s8, s1 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s6 +; GFX8-NEXT: s_lshr_b32 s6, s2, 8 +; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_lshr_b32 s8, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_lshr_b32 s9, s3, 8 +; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_lshl_b32 s3, s3, s11 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_or_b32 s3, s8, s3 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s6, s4, 8 +; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX8-NEXT: s_lshr_b32 s7, s4, 16 +; GFX8-NEXT: s_lshr_b32 s8, s4, 24 +; GFX8-NEXT: s_and_b32 s4, s4, s10 +; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_or_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s6, 0, 24 +; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 +; GFX8-NEXT: s_lshr_b32 s9, s5, 8 +; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: s_and_b32 s7, s9, s10 +; GFX8-NEXT: s_or_b32 s5, s8, s5 +; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_lshl_b32 s7, s7, 16 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX8-NEXT: s_or_b32 s5, s5, s7 +; GFX8-NEXT: s_mov_b32 s7, 0xffffff +; GFX8-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s0 +; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_and_b32_e32 v2, s7, v2 +; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, s10, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v2i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s7, s0, 8 +; GFX9-NEXT: s_movk_i32 s11, 0xff +; GFX9-NEXT: s_and_b32 s7, s7, s11 +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s11 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_and_b32 s7, s8, s11 +; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_lshr_b32 s10, s1, 8 +; GFX9-NEXT: s_and_b32 s1, s1, s11 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_or_b32 s0, s0, s7 +; GFX9-NEXT: s_and_b32 s7, s10, s11 +; GFX9-NEXT: s_or_b32 s1, s9, s1 +; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s7 +; GFX9-NEXT: s_lshr_b32 s7, s2, 8 +; GFX9-NEXT: s_and_b32 s7, s7, s11 +; GFX9-NEXT: s_lshr_b32 s8, s2, 16 +; GFX9-NEXT: s_lshr_b32 s9, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, s11 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_and_b32 s7, s8, s11 +; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_lshr_b32 s10, s3, 8 +; GFX9-NEXT: s_and_b32 s3, s3, s11 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_and_b32 s7, s10, s11 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX9-NEXT: s_or_b32 s3, s9, s3 +; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s7 +; GFX9-NEXT: s_lshr_b32 s7, s4, 8 +; GFX9-NEXT: s_and_b32 s7, s7, s11 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 +; GFX9-NEXT: s_lshr_b32 s9, s4, 24 +; GFX9-NEXT: s_and_b32 s4, s4, s11 +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_or_b32 s4, 
s4, s7 +; GFX9-NEXT: s_and_b32 s7, s8, s11 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_lshl_b32 s7, s7, 16 +; GFX9-NEXT: s_or_b32 s4, s4, s7 +; GFX9-NEXT: s_sub_i32 s7, 0, 24 +; GFX9-NEXT: v_mul_lo_u32 v1, s7, v0 +; GFX9-NEXT: s_lshr_b32 s10, s5, 8 +; GFX9-NEXT: s_and_b32 s5, s5, s11 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: s_and_b32 s8, s10, s11 +; GFX9-NEXT: s_or_b32 s5, s9, s5 +; GFX9-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX9-NEXT: s_lshl_b32 s8, s8, 16 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX9-NEXT: s_or_b32 s5, s5, s8 +; GFX9-NEXT: s_mov_b32 s8, 0xffffff +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX9-NEXT: v_sub_u32_e32 v2, 23, v1 +; GFX9-NEXT: s_lshr_b32 s0, s3, 1 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v4, s11, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s11, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX9-NEXT: v_or3_b32 v0, v2, v0, v4 +; GFX9-NEXT: v_and_or_b32 v1, v3, s11, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v2i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 +; GFX10-NEXT: s_movk_i32 s8, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s1, 8 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_and_b32 s1, s1, s8 +; GFX10-NEXT: s_lshr_b32 s9, s0, 24 +; 
GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_sub_i32 s9, 0, 24 +; GFX10-NEXT: s_and_b32 s6, s6, s8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: s_and_b32 s0, s0, s8 +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_lshr_b32 s12, s4, 24 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s7, s8 +; GFX10-NEXT: s_lshr_b32 s7, s4, 8 +; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s9, v1 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_and_b32 s7, s7, s8 +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s11, s8 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_lshl_b32 s7, s7, s10 +; GFX10-NEXT: s_and_b32 s9, s11, s8 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_bfe_u32 s7, s9, 0x100000 +; GFX10-NEXT: s_lshr_b32 s13, s5, 8 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 +; GFX10-NEXT: s_and_b32 s5, s5, s8 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s13, s8 +; GFX10-NEXT: s_or_b32 s5, s12, s5 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: s_lshr_b32 s7, s2, 8 +; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_lshr_b32 s12, s3, 8 +; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s4, s7, s8 +; GFX10-NEXT: s_and_b32 s7, s9, s8 +; GFX10-NEXT: s_lshl_b32 s4, s4, s10 +; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_bfe_u32 s4, s7, 0x100000 +; GFX10-NEXT: s_mov_b32 s5, 0xffffff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_and_b32 s4, s12, s8 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX10-NEXT: 
v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX10-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s4, 16 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX10-NEXT: s_lshr_b32 s0, s2, 1 +; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v4, s0 +; GFX10-NEXT: s_or_b32 s0, s1, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, s0, v1, v2 +; GFX10-NEXT: s_mov_b32 s0, 8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v3, s8, v1 +; GFX10-NEXT: v_and_b32_sdwa v4, v1, s8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX10-NEXT: v_and_or_b32 v2, v0, s8, v2 +; GFX10-NEXT: v_and_b32_sdwa v0, v0, s8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, s8, v4 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog + %lhs = bitcast i48 %lhs.arg to <2 x i24> + %rhs = bitcast i48 %rhs.arg to <2 x i24> + %amt = bitcast i48 %amt.arg to <2 x i24> + %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) + %cast.result = bitcast <2 x i24> %result to i48 + ret i48 %cast.result +} + +define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { +; GFX6-LABEL: v_fshl_v2i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX6-NEXT: s_sub_i32 s4, 0, 24 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v8, 24 +; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_mul_lo_u32 v7, s4, v6 +; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v8 +; GFX6-NEXT: v_mov_b32_e32 v8, 0xffffff +; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX6-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s4, v7 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 23, v4 +; GFX6-NEXT: v_and_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_mul_hi_u32 v6, v7, v6 +; GFX6-NEXT: v_and_b32_e32 v4, v5, v8 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v6 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 +; GFX6-NEXT: v_and_b32_e32 v6, v9, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, 24 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v5 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX6-NEXT: 
v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v2, v3, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX8-NEXT: s_sub_i32 s4, 0, 24 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v8, 24 +; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_mul_lo_u32 v7, s4, v6 +; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v8 +; GFX8-NEXT: v_mov_b32_e32 v8, 0xffffff +; GFX8-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX8-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s4, v7 +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 23, v4 +; GFX8-NEXT: v_and_b32_e32 v4, v4, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_mul_hi_u32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v4, v5, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v6 +; GFX8-NEXT: v_mul_hi_u32 v5, v4, v5 +; GFX8-NEXT: v_and_b32_e32 v6, v9, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_lo_u32 v5, v5, 24 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v4, v5 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v2, v3, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX9-NEXT: s_sub_i32 s4, 0, 24 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v8, 24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8 +; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX9-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff +; GFX9-NEXT: v_mul_lo_u32 v7, s4, v6 +; GFX9-NEXT: v_and_b32_e32 v5, v5, v9 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: 
v_mul_hi_u32 v7, v6, v7 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, s4, v7 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 +; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 +; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 +; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 +; GFX9-NEXT: v_and_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7 +; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 +; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i24: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 +; GFX10-NEXT: s_sub_i32 s4, 0, 24 +; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX10-NEXT: v_and_b32_e32 v5, v5, v12 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v12 +; GFX10-NEXT: v_and_b32_e32 v3, v3, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX10-NEXT: v_mul_lo_u32 v8, s4, v6 +; GFX10-NEXT: v_mul_lo_u32 v9, s4, v7 +; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8 +; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9 +; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v4, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v11, v6, v12 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX10-NEXT: v_and_b32_e32 
v10, v5, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, v11, v2 +; GFX10-NEXT: v_and_b32_e32 v6, v7, v12 +; GFX10-NEXT: v_and_b32_e32 v7, v15, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, v6, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, v7, v2 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, v10, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) + ret <2 x i24> %result +} + +define amdgpu_ps i32 @s_fshl_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { +; GFX6-LABEL: s_fshl_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_not_b32 s1, s2 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_not_b32 s1, s2 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) + ret i32 %result +} + +define amdgpu_ps i32 @s_fshl_i32_5(i32 inreg %lhs, i32 inreg %rhs) { +; GFX6-LABEL: s_fshl_i32_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, -5 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i32_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, -5 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i32_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, -5 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i32_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, -5 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5) + ret i32 %result +} + +define amdgpu_ps i32 @s_fshl_i32_8(i32 inreg %lhs, i32 inreg %rhs) { +; GFX6-LABEL: s_fshl_i32_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, -8 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i32_8: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, -8 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i32_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: 
v_alignbit_b32 v0, s0, v0, -8 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i32_8: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, -8 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8) + ret i32 %result +} + +define i32 @v_fshl_i32(i32 %lhs, i32 %rhs, i32 %amt) { +; GFX6-LABEL: v_fshl_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v1, v0, v1, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) + ret i32 %result +} + +define i32 @v_fshl_i32_5(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_fshl_i32_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, -5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i32_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, -5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i32_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, -5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i32_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, -5 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 5) + ret i32 %result +} + +define i32 @v_fshl_i32_8(i32 %lhs, i32 %rhs) { +; GFX6-LABEL: v_fshl_i32_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, -8 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i32_8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, -8 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i32_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, -8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i32_8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, -8 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i32 
@llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 8) + ret i32 %result +} + +define amdgpu_ps float @v_fshl_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) { +; GFX6-LABEL: v_fshl_i32_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v1, s0, v1, 1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i32_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v1, s0, v1, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i32_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i32_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v1, s0, s1, 1 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v1, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) + %cast.result = bitcast i32 %result to float + ret float %cast.result +} + +define amdgpu_ps float @v_fshl_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) { +; GFX6-LABEL: v_fshl_i32_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_not_b32 s1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i32_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_not_b32 s1, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i32_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_not_b32 s1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i32_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) + %cast.result = bitcast i32 %result to float + ret float %cast.result +} + +define amdgpu_ps float @v_fshl_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) { +; GFX6-LABEL: v_fshl_i32_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: s_not_b32 s1, s2 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i32_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_not_b32 s1, s2 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i32_vss: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_not_b32 s1, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i32 @llvm.fshl.i32(i32 %lhs, i32 %rhs, i32 %amt) + %cast.result = bitcast i32 %result to float + ret float %cast.result +} + +define <2 x i32> @v_fshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) { +; GFX6-LABEL: v_fshl_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX6-NEXT: v_alignbit_b32 v2, v1, v3, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX8-NEXT: v_alignbit_b32 v2, v1, v3, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX9-NEXT: v_alignbit_b32 v2, v1, v3, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v2, v0, v2, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX10-NEXT: v_alignbit_b32 v3, v1, v3, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX10-NEXT: v_alignbit_b32 v0, v7, v2, v4 +; GFX10-NEXT: v_alignbit_b32 v1, v6, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) + ret <2 x i32> %result +} + +define <3 x i32> @v_fshl_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) { +; GFX6-LABEL: v_fshl_v3i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX6-NEXT: v_alignbit_b32 v3, v1, v4, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v4 +; GFX6-NEXT: v_alignbit_b32 v3, v2, v5, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: v_fshl_v3i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX8-NEXT: v_alignbit_b32 v3, v1, v4, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v4 +; GFX8-NEXT: v_alignbit_b32 v3, v2, v5, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX9-NEXT: v_alignbit_b32 v3, v1, v4, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v4 +; GFX9-NEXT: v_alignbit_b32 v3, v2, v5, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v8 +; GFX9-NEXT: v_alignbit_b32 v2, v2, v3, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v3, v0, v3, 1 +; GFX10-NEXT: v_alignbit_b32 v4, v1, v4, 1 +; GFX10-NEXT: v_alignbit_b32 v5, v2, v5, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7 +; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x i32> @llvm.fshl.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) + ret <3 x i32> %result +} + +define <4 x i32> @v_fshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) { +; GFX6-LABEL: v_fshl_v4i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX6-NEXT: v_alignbit_b32 v4, v1, v5, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v2, v6, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_alignbit_b32 v2, v2, v4, v5 +; GFX6-NEXT: v_alignbit_b32 v4, v3, v7, 1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v4i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX8-NEXT: v_alignbit_b32 v4, v1, v5, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v2, v6, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, 
v10 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v4, v5 +; GFX8-NEXT: v_alignbit_b32 v4, v3, v7, 1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v4, v0, v4, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX9-NEXT: v_alignbit_b32 v4, v1, v5, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v9 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_alignbit_b32 v4, v2, v6, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX9-NEXT: v_alignbit_b32 v2, v2, v4, v5 +; GFX9-NEXT: v_alignbit_b32 v4, v3, v7, 1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX9-NEXT: v_alignbit_b32 v3, v3, v4, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_alignbit_b32 v22, v1, v5, 1 +; GFX10-NEXT: v_alignbit_b32 v18, v0, v4, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v15, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 1, v1 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX10-NEXT: v_alignbit_b32 v5, v2, v6, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 1, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v10 +; GFX10-NEXT: v_alignbit_b32 v13, v3, v7, 1 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 +; GFX10-NEXT: v_alignbit_b32 v0, v15, v18, v8 +; GFX10-NEXT: v_alignbit_b32 v1, v19, v22, v9 +; GFX10-NEXT: v_alignbit_b32 v2, v23, v5, v10 +; GFX10-NEXT: v_alignbit_b32 v3, v14, v13, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) + ret <4 x i32> %result +} + +define amdgpu_ps i16 @s_fshl_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt) { +; GFX6-LABEL: s_fshl_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s3, s2, 15 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: s_lshr_b32 s1, s1, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s3, s2, 15 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s3, s2, 15 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 15, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: 
s_fshl_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s3, s2, 15 +; GFX10-NEXT: s_andn2_b32 s2, 15, s2 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) + ret i16 %result +} + +define amdgpu_ps i16 @s_fshl_i16_4(i16 inreg %lhs, i16 inreg %rhs) { +; GFX6-LABEL: s_fshl_i16_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 4 +; GFX6-NEXT: s_lshr_b32 s1, s1, 12 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i16_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i16_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_bfe_u32 s2, 12, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i16_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_u32 s2, 4, 0x100000 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s3, 12, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4) + ret i16 %result +} + +define amdgpu_ps i16 @s_fshl_i16_5(i16 inreg %lhs, i16 inreg %rhs) { +; GFX6-LABEL: s_fshl_i16_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_lshl_b32 s0, s0, 5 +; GFX6-NEXT: s_lshr_b32 s1, s1, 11 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i16_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i16_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i16_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_u32 s2, 5, 0x100000 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s3, 11, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5) + ret i16 %result +} + +define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) { +; GFX6-LABEL: v_fshl_i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 
v2, 15, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 15, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) + ret i16 %result +} + +define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_fshl_i16_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 12, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i16_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 12, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i16_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 12, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i16_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 12, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4) + ret i16 %result +} + +define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) { +; GFX6-LABEL: v_fshl_i16_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i16_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 5, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 11, v1 +; GFX8-NEXT: 
v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i16_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 5, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, 11, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i16_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 5, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 11, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5) + ret i16 %result +} + +define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) { +; GFX6-LABEL: v_fshl_i16_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i16_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i16_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX9-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i16_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) + %cast.result = bitcast i16 %result to half + ret half %cast.result +} + +define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) { +; GFX6-LABEL: v_fshl_i16_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s2, s1, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i16_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, s1, 15 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: 
s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i16_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, s1, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: v_lshrrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_lshrrev_b16_e32 v0, s1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i16_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: s_andn2_b32 s2, 15, s1 +; GFX10-NEXT: s_and_b32 s1, s1, 15 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, s2, v0 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) + %cast.result = bitcast i16 %result to half + ret half %cast.result +} + +define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) { +; GFX6-LABEL: v_fshl_i16_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s2, s1, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i16_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s2, s1, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, s2, v0 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i16_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s2, s1, 15 +; GFX9-NEXT: s_andn2_b32 s1, 15, s1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, s2, v0 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i16_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s2, s1, 15 +; GFX10-NEXT: s_andn2_b32 s1, 15, s1 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, s2, v0 +; GFX10-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt) + %cast.result = bitcast i16 %result to half + ret half %cast.result +} + +define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { +; GFX6-LABEL: s_fshl_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s0, 16 +; GFX6-NEXT: s_lshr_b32 s4, s2, 16 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_and_b32 s5, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshr_b32 
s2, s5, s2 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s2, s4, 15 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_andn2_b32 s4, 15, s4 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, 17 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX6-NEXT: s_lshr_b32 s1, s1, s3 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s6, s2, 15 +; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_lshr_b32 s5, s2, 16 +; GFX8-NEXT: s_andn2_b32 s2, 15, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s6, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s6 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s5, 15 +; GFX8-NEXT: s_andn2_b32 s2, 15, s5 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_lshr_b32 s3, s4, s6 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s3, 0xf000f +; GFX9-NEXT: s_and_b32 s4, s2, s3 +; GFX9-NEXT: s_andn2_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s5 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_mov_b32 s3, 0xf000f +; GFX10-NEXT: s_and_b32 s7, s1, s5 +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s7, s7, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s4, s2, s3 +; GFX10-NEXT: s_andn2_b32 s2, s3, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_and_b32 s5, s2, s5 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s5 +; GFX10-NEXT: s_lshr_b32 s2, s4, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to i32 + ret i32 %cast +} + +define 
<2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { +; GFX6-LABEL: v_fshl_v2i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + ret <2 x i16> %result +} + +define <2 x i16> @v_fshl_v2i16_4_8(<2 x i16> %lhs, 
<2 x i16> %rhs) { +; GFX6-LABEL: v_fshl_v2i16_4_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i16_4_8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 11, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 7, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i16_4_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_i32 s4, 0, 16 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v2, 4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX9-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i16_4_8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 +; GFX10-NEXT: s_sub_i32 s4, 0, 16 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX10-NEXT: v_mul_hi_u32 v2, 8, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, 4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 8, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 4, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2 +; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>) + ret <2 x i16> %result +} + +define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { +; GFX6-LABEL: v_fshl_v2i16_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: s_lshr_b32 s0, s1, 17 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_v2i16_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 +;
GFX8-NEXT: s_bfe_u32 s1, 1, 0x100000 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: s_lshr_b32 s0, s3, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_v2i16_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: s_lshr_b32 s0, s1, 16 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_v2i16_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { +; GFX6-LABEL: v_fshl_v2i16_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s4, s1, 15 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 +; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_and_b32 s0, s3, 15 +; GFX6-NEXT: s_andn2_b32 s1, 15, s3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 17, v0 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s2, s0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_v2i16_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, s1, v1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: v_or_b32_e32 
v1, s0, v1 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 +; GFX8-NEXT: s_lshl_b32 s0, s2, s0 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_v2i16_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s1, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_v2i16_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_and_b32 s3, s1, s2 +; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s1, v0 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s1, s2, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { +; GFX6-LABEL: v_fshl_v2i16_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s3, s1, 15 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_lshr_b32 s2, s1, 16 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_and_b32 s3, s0, 0xffff +; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s1, s3, s1 +; GFX6-NEXT: v_or_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s1, s2, 15 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_andn2_b32 s2, 15, s2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s1, v1 +; GFX6-NEXT: s_lshr_b32 s0, s0, 17 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_v2i16_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s4, s1, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s4, v0 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, s4 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_v2i16_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 +; GFX9-NEXT: s_mov_b32 s3, 0xffff +; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s0, s0, 1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s0, s0, s1 +; GFX9-NEXT: s_lshr_b32 s1, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_v2i16_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_and_b32 s5, s0, s3 +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s5, 1 +; GFX10-NEXT: s_lshr_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s4, s1, s2 +; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s5, s0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s3, s1, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s2, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +; ; FIXME +; define amdgpu_ps i48 @s_fshl_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { +; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) +; %cast = bitcast <3 x i16> %result to i48 +; ret i48 %cast +; } + +; ; FIXME +; define <3 x half> @v_fshl_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { +; %result = call <3 x i16> @llvm.fshl.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) +; %cast.result = bitcast <3 x i16> %result to <3 x half> +; ret <3 x half> %cast.result +; } + +define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { +; GFX6-LABEL: s_fshl_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s12, s8, 15 +; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s12 +; GFX6-NEXT: s_mov_b32 s12, 0xffff +; GFX6-NEXT: s_andn2_b32 s8, 15, s8 +; GFX6-NEXT: s_and_b32 s4, s4, s12 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s8 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s9, 15 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s5, s12 +; GFX6-NEXT: 
s_andn2_b32 s8, 15, s9 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s10, 15 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s5, 15, s10 +; GFX6-NEXT: s_lshl_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s4, s6, s12 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_or_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s4, s11, 15 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_andn2_b32 s5, 15, s11 +; GFX6-NEXT: s_lshl_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s4, s7, s12 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s12, s4, 15 +; GFX8-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s8, s2, 16 +; GFX8-NEXT: s_lshr_b32 s10, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s12, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s12 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s4 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s10, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s10 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s6, s2 +; GFX8-NEXT: s_lshr_b32 s6, s8, s12 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: s_lshr_b32 s9, s3, 16 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshr_b32 s11, s5, 16 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s12 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s3, s11, 15 +; GFX8-NEXT: s_andn2_b32 s4, 15, s11 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshr_b32 s5, s9, s12 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s3, s7, s3 +; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s6, 0xf000f +; GFX9-NEXT: s_and_b32 s7, s4, s6 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s7, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s7 +; GFX9-NEXT: s_lshl_b32 s7, s8, s9 +; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, s8 +; GFX9-NEXT: 
s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_lshr_b32 s7, s7, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 +; GFX9-NEXT: s_andn2_b32 s4, s6, s4 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 +; GFX9-NEXT: s_and_b32 s2, s2, s8 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s7, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s6 +; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, 16 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_and_b32 s2, s2, s8 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s9, 0xffff +; GFX10-NEXT: s_mov_b32 s6, 0xf000f +; GFX10-NEXT: s_and_b32 s11, s2, s9 +; GFX10-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-NEXT: s_and_b32 s7, s4, s6 +; GFX10-NEXT: s_lshr_b32 s11, s11, 1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_andn2_b32 s4, s6, s4 +; GFX10-NEXT: s_lshr_b32 s10, s7, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s7 +; GFX10-NEXT: s_lshl_b32 s7, s8, s10 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_and_b32 s10, s4, s9 +; GFX10-NEXT: s_and_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s10 +; GFX10-NEXT: s_lshr_b32 s4, s8, s4 +; GFX10-NEXT: s_and_b32 s8, s3, s9 +; GFX10-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s4, s5, s6 +; GFX10-NEXT: s_lshr_b32 s8, s8, 1 +; GFX10-NEXT: s_lshr_b32 s3, s3, 1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX10-NEXT: s_andn2_b32 s5, s6, s5 +; GFX10-NEXT: s_lshr_b32 s6, s1, 16 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10-NEXT: s_lshl_b32 s4, s6, s7 +; GFX10-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-NEXT: s_and_b32 s7, s5, s9 +; GFX10-NEXT: s_and_b32 s3, s3, s9 +; GFX10-NEXT: s_lshr_b32 s5, s5, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s7 +; GFX10-NEXT: s_lshr_b32 s5, s6, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog + %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) + %cast.result = bitcast <4 x i16> %result to <2 x i32> + ret <2 x i32> %cast.result +} + +define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) { +; GFX6-LABEL: v_fshl_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX6-NEXT: v_bfe_u32 v12, v12, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: 
v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 +; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, v7, v12 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v8, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, v8, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v9 +; GFX8-NEXT: v_or_b32_e32 v4, v8, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v6 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v5, 1 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, 1, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v11, s4, v4 +; GFX10-NEXT: v_and_b32_e32 v15, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v19, s4, v5 +; GFX10-NEXT: v_and_b32_e32 v6, s4, v7 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v11, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v15, v2 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v19, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v6, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) + %cast.result = bitcast <4 x i16> %result to <4 x half> + ret <4 x half> %cast.result +} + +define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { +; GCN-LABEL: s_fshl_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63 +; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshl_i64_5(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshl_i64_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshr_b32 s2, s3, 27 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 5 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshl_i64_32(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshl_i64_32: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s2, s3 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshl_i64_48(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshl_i64_48: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s1, s0, 16 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48) + ret i64 
%result +} + +define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { +; GFX6-LABEL: v_fshl_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_and_b32_e32 v7, 63, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 63, v5 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) + ret i64 %result +} + +define i64 @v_fshl_i64_5(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_fshl_i64_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 5 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 27, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i64_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 27, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i64_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 27, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i64_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 5, v[0:1] +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 27, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 5) + ret i64 %result +} + +define i64 @v_fshl_i64_32(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_fshl_i64_32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i64_32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i64_32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i64_32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 32) + ret i64 %result +} + +define i64 @v_fshl_i64_48(i64 %lhs, i64 %rhs) { +; GFX6-LABEL: v_fshl_i64_48: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[2:3], 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i64_48: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i64_48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i64_48: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v4, v0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[2:3] +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 48) + ret i64 %result +} + +define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { +; GFX6-LABEL: v_fshl_i64_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 +; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i64_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i64_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; 
GFX10-LABEL: v_fshl_i64_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: v_and_b32_e32 v2, 63, v1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) + %cast = bitcast i64 %result to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) { +; GFX6-LABEL: v_fshl_i64_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i64_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i64_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i64_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] +; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) + %cast = bitcast i64 %result to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) { +; GFX6-LABEL: v_fshl_i64_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i64_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i64_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; 
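Worth noting in the svs/vss variants around this point: when the shift amount is uniform, the "~z & 63" term folds into a single s_andn2_b64 on the scalar ALU. The computation is unchanged; a sketch (name is mine, not from the patch):

#include <cstdint>

// Same expansion as before, written to mirror the SALU form: the
// NOT and AND fuse into one s_andn2_b64 (63 & ~z).
uint64_t fshl64_sgpr_amt(uint64_t x, uint64_t y, uint64_t z) {
  uint64_t sh = z & 63;    // s_and_b64  sX, z, 63
  uint64_t inv = 63 & ~z;  // s_andn2_b64 sX, 63, z
  return (x << sh) | ((y >> 1) >> inv);
}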
GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i64_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 +; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) + %cast = bitcast i64 %result to <2 x float> + ret <2 x float> %cast +} + +define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { +; GFX6-LABEL: s_fshl_v2i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 +; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 +; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 +; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63 +; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX10-NEXT: s_and_b64 s[8:9], s[10:11], 63 +; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) + ret <2 x i64> %result +} + +define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { +; GFX6-LABEL: v_fshl_v2i64: +; 
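The <2 x i64> tests bracketing this point contain no new logic: the legalizer scalarizes the vector and applies the i64 expansion per lane. A self-contained sketch (helper name assumed, not in the patch):

#include <array>
#include <cstdint>

// Per-lane application of the i64 funnel-shift expansion, modelling
// what s_fshl_v2i64 / v_fshl_v2i64 check after scalarization.
std::array<uint64_t, 2> fshl_v2i64(const std::array<uint64_t, 2> &x,
                                   const std::array<uint64_t, 2> &y,
                                   const std::array<uint64_t, 2> &z) {
  std::array<uint64_t, 2> r{};
  for (int i = 0; i < 2; ++i)
    r[i] = (x[i] << (z[i] & 63)) | ((y[i] >> 1) >> (~z[i] & 63));
  return r;
}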
GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 +; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 +; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 +; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX10-NEXT: v_and_b32_e32 v15, 63, v8 +; GFX10-NEXT: v_and_b32_e32 v19, 63, v9 +; GFX10-NEXT: v_and_b32_e32 v13, 63, v11 +; GFX10-NEXT: v_and_b32_e32 v9, 63, v10 +; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3] +; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v12, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v16, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) + ret <2 x i64> %result +} + +define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { +; GFX6-LABEL: s_fshl_i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s10, 0x7f +; 
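The i128 tests starting here look very different from the i64 ones because a 128-bit shift must itself be legalized to 64-bit halves; each s_cmp_lt_u32 / s_cselect_b64 pair above implements one of the three cases below, just in branchless form. A reference sketch (the struct and names are mine, not the in-tree code):

#include <cstdint>

struct U128 { uint64_t lo, hi; };

// 128-bit shift-left built from two 64-bit words. The amount is
// assumed already reduced mod 128. The amt == 0 case must be split
// out: otherwise "lo >> (64 - amt)" would be a shift by 64, which
// is undefined. That is the case the s_cmp_eq_u32 <amt>, 0 /
// s_cselect pairs guard in the branchless form.
U128 shl128(U128 v, unsigned amt) {
  if (amt == 0)
    return v;
  if (amt < 64)
    return {v.lo << amt, (v.hi << amt) | (v.lo >> (64 - amt))};
  return {0, v.lo << (amt - 64)};   // amt in [64, 127]
}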
GFX6-NEXT: s_mov_b32 s11, 0 +; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_sub_i32 s9, s12, 64 +; GFX6-NEXT: s_sub_i32 s13, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s12 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s13 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 +; GFX6-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s9, 1, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[4:5], 1 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[12:13], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX6-NEXT: s_sub_i32 s14, s8, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s10, 0x7f +; GFX8-NEXT: s_mov_b32 s11, 0 +; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_sub_i32 s9, s12, 64 +; GFX8-NEXT: s_sub_i32 s13, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s12 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s13 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 +; GFX8-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s9, 1, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[4:5], 1 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX8-NEXT: s_lshr_b64 
s[0:1], s[6:7], 1 +; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[12:13], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX8-NEXT: s_sub_i32 s14, s8, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s10, 0x7f +; GFX9-NEXT: s_mov_b32 s11, 0 +; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_sub_i32 s9, s12, 64 +; GFX9-NEXT: s_sub_i32 s13, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s12 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s13 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 +; GFX9-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s9, 1, 64 +; GFX9-NEXT: s_sub_i32 s14, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[4:5], 1 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[6:7], s14 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[12:13], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9-NEXT: s_sub_i32 s14, s8, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_or_b64 
s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s10, 0x7f +; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11] +; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9] +; GFX10-NEXT: s_sub_i32 s9, s12, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s12 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s9, 1, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, 1 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[6:7], 1 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[14:15], 0 +; GFX10-NEXT: s_sub_i32 s14, s8, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) + ret i128 %result +} + +define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { +; GFX6-LABEL: v_fshl_i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0x7f +; GFX6-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14 +; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; 
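Putting the pieces together for the VALU version being checked here: the overall i128 lowering is the same construction as i64, only with BW = 128, so the mask becomes 0x7f (visible as s_movk_i32 0x7f above). A compact model, using Clang/GCC's __uint128_t extension purely to stand in for the legalized 64-bit-pair arithmetic (an assumption for brevity, not what the backend emits):

// fshl(x, y, z) for i128: z % 128 via z & 127, the inverse amount
// via ~z & 127, and an extra ">> 1" on y so all shifts stay < 128.
__uint128_t fshl128_reference(__uint128_t x, __uint128_t y,
                              __uint128_t z) {
  unsigned sh = (unsigned)z & 127;     // s_and_b64 with 0x7f
  unsigned inv = (unsigned)(~z) & 127; // s_andn2_b64 with 0x7f
  return (x << sh) | ((y >> 1) >> inv);
}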
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GFX6-NEXT: s_sub_i32 s4, 1, 64 +; GFX6-NEXT: s_sub_i32 s5, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s5 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], s4 +; GFX6-NEXT: s_and_b32 s4, 1, s6 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], 1 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 +; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_movk_i32 s4, 0x7f +; GFX8-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX8-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GFX8-NEXT: s_sub_i32 s4, 1, 64 +; GFX8-NEXT: s_sub_i32 s5, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], s5, v[6:7] +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[6:7] +; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: 
s_and_b32 s4, 1, s7 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x7f +; GFX9-NEXT: v_and_b32_e32 v14, s4, v8 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX9-NEXT: v_and_b32_e32 v15, s4, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_sub_i32 s4, 1, 64 +; GFX9-NEXT: s_sub_i32 s5, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], s5, v[6:7] +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[6:7] +; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 +; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, 
v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v13, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 +; GFX10-NEXT: s_movk_i32 s4, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v27, v2 +; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 +; GFX10-NEXT: v_mov_b32_e32 v28, v3 +; GFX10-NEXT: v_and_b32_e32 v19, s4, v9 +; GFX10-NEXT: s_sub_i32 s4, 64, 1 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[6:7] +; GFX10-NEXT: s_sub_i32 s4, 1, 64 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_lshrrev_b64 v[15:16], s4, v[6:7] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: v_sub_nc_u32_e32 v14, 64, v18 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v14, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[27:28] +; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, v6, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v7, s4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v23, 64, v18 +; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[21:22] +; GFX10-NEXT: v_lshlrev_b64 v[12:13], v18, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v8, v[21:22] +; GFX10-NEXT: v_or_b32_e32 v14, v14, v16 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v23, v8, v14, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v15, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v31, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v23, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v27, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 +; GFX10-NEXT: 
v_cndmask_b32_e64 v8, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v28, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v31, v4 +; GFX10-NEXT: v_or_b32_e32 v1, v7, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) + ret i128 %result +} + +define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { +; GFX6-LABEL: v_fshl_i128_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s8, 0x7f +; GFX6-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 +; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6 +; GFX6-NEXT: s_sub_i32 s10, 1, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v7 +; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v2 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 +; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[0:1], v7 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i128_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s8, 0x7f +; GFX8-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6 +; GFX8-NEXT: s_sub_i32 s10, 1, 64 +; 
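One readability casualty in these checks: the inner "y >> 1" is an i128 shift by the constant 1, and it currently goes through the generic wide-shift expansion, which is where the dead s_sub_i32 1, 64 / s_cmp_lt_u32 1, 64 sequences repeated above come from. With constant folding it would collapse to the two-instruction form sketched below (expected output under that assumption, not current codegen):

#include <cstdint>

struct U128 { uint64_t lo, hi; };

// lshr i128 by the constant 1, fully folded: one funnel between the
// halves plus one plain shift of the high half.
U128 lshr128_by1(U128 v) {
  return {(v.lo >> 1) | (v.hi << 63), v.hi >> 1};
}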
GFX8-NEXT: s_sub_i32 s8, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i128_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s8, 0x7f +; GFX9-NEXT: v_and_b32_e32 v6, s8, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_b32_e32 v7, s8, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6 +; GFX9-NEXT: s_sub_i32 s10, 1, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 1 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 1 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: 
s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1] +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 +; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i128_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s8, 0x7f +; GFX10-NEXT: s_sub_i32 s14, 1, 64 +; GFX10-NEXT: v_and_b32_e32 v12, s8, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX10-NEXT: s_sub_i32 s10, 64, 1 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v12 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_and_b32_e32 v13, s8, v0 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v1, s[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[2:3], v12, s[2:3] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], 1 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[6:7], 1 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[4:5], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[6:7] +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[0:1] +; GFX10-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[8:9] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] +; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 +; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[8:9] +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v19, 
v8, s2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s6, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX10-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX10-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX10-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) + %cast.result = bitcast i128 %result to <4 x float> + ret <4 x float> %cast.result +} + +define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { +; GFX6-LABEL: v_fshl_i128_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s6, 0x7f +; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX6-NEXT: s_sub_i32 s5, s8, 64 +; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX6-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s1, 64, 1 +; GFX6-NEXT: s_sub_i32 s0, 1, 64 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], 1 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_and_b32 s0, 1, s5 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX6-NEXT: s_and_b32 s0, 1, s8 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] +; GFX6-NEXT: s_sub_i32 s0, s4, 64 +; GFX6-NEXT: s_sub_i32 s1, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_and_b32 s0, 1, s5 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: s_and_b32 s0, 1, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX6-NEXT: s_and_b32 s0, 1, s5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX6-NEXT: 
v_or_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i128_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s6, 0x7f +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX8-NEXT: s_sub_i32 s5, s8, 64 +; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX8-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s1, 64, 1 +; GFX8-NEXT: s_sub_i32 s0, 1, 64 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_and_b32 s0, 1, s8 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] +; GFX8-NEXT: s_sub_i32 s0, s4, 64 +; GFX8-NEXT: s_sub_i32 s1, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: s_and_b32 s0, 1, s8 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: s_and_b32 s0, 1, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i128_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s6, 0x7f +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_sub_i32 s5, s8, 64 +; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s8 +; 
GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX9-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s1, 64, 1 +; GFX9-NEXT: s_sub_i32 s0, 1, 64 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s5 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX9-NEXT: s_and_b32 s0, 1, s8 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, s4, 64 +; GFX9-NEXT: s_sub_i32 s1, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s5 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: s_and_b32 s0, 1, s8 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX9-NEXT: s_and_b32 s0, 1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i128_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s6, 0x7f +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[0:1] +; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_sub_i32 s5, s8, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s8 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 64, 1 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] +; GFX10-NEXT: s_sub_i32 s0, 1, 64 +; 
GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX10-NEXT: s_sub_i32 s0, 64, s4 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] +; GFX10-NEXT: s_sub_i32 s0, s4, 64 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) + %cast.result = bitcast i128 %result to <4 x float> + ret <4 x float> %cast.result +} + +define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { +; GFX6-LABEL: v_fshl_i128_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s6, 0x7f +; GFX6-NEXT: s_mov_b32 s7, 0 +; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX6-NEXT: s_sub_i32 s6, 64, s8 +; GFX6-NEXT: s_sub_i32 s5, s8, 64 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s8 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s8 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 +; GFX6-NEXT: s_and_b32 s5, 1, s7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_sub_i32 s5, 1, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s13, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX6-NEXT: s_lshr_b64 s[6:7], 
s[2:3], 1 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: s_sub_i32 s10, s4, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s12, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshl_i128_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s6, 0x7f +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX8-NEXT: s_sub_i32 s6, 64, s8 +; GFX8-NEXT: s_sub_i32 s5, s8, 64 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX8-NEXT: s_and_b32 s5, 1, s7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_sub_i32 s5, 1, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s13, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], 1 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: s_sub_i32 s10, s4, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s12, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], 
s[4:5], s[2:3] +; GFX8-NEXT: s_cmp_lg_u32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshl_i128_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s6, 0x7f +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_sub_i32 s6, 64, s8 +; GFX9-NEXT: s_sub_i32 s5, s8, 64 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX9-NEXT: s_and_b32 s5, 1, s7 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_sub_i32 s5, 1, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s13, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], 1 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3] +; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: s_sub_i32 s10, s4, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s12, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshl_i128_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s6, 0x7f +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7] +; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_sub_i32 s5, s8, 64 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s9, v[0:1] +; 
GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s8, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s8, v[0:1] +; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[11:12], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s12, 1, s6 +; GFX10-NEXT: s_sub_i32 s13, 1, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, 1 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: s_cselect_b32 s14, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], 1 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[2:3], 1 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13 +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc_lo +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s14, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v5, vcc_lo +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s12 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[10:11], 0 +; GFX10-NEXT: s_sub_i32 s10, s4, 64 +; GFX10-NEXT: s_sub_i32 s5, 64, s4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo +; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) + %cast.result = bitcast i128 %result to <4 x float> + ret <4 x float> %cast.result +} + +define amdgpu_ps i128 @s_fshl_i128_65(i128 inreg %lhs, i128 inreg %rhs) { +; GFX6-LABEL: s_fshl_i128_65: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s8, 0x41 +; GFX6-NEXT: s_sub_i32 s16, s8, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s18, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s14, 63, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, 63 +; GFX6-NEXT: s_cmp_lt_u32 63, 64 +; GFX6-NEXT: s_mov_b32 s9, 0 +; 
GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 63, 0 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_lshr_b32 s0, s5, 31 +; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX6-NEXT: s_lshr_b32 s8, s7, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i128_65: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s8, 0x41 +; GFX8-NEXT: s_sub_i32 s16, s8, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s14, 63, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, 63 +; GFX8-NEXT: s_cmp_lt_u32 63, 64 +; GFX8-NEXT: s_mov_b32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 63, 0 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_lshr_b32 s0, s5, 31 +; GFX8-NEXT: s_mov_b32 s1, s9 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX8-NEXT: s_lshr_b32 s8, s7, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i128_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s8, 0x41 +; GFX9-NEXT: s_sub_i32 s16, s8, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s18, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s14, 63, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, 63 +; GFX9-NEXT: s_cmp_lt_u32 63, 64 +; GFX9-NEXT: s_mov_b32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 63, 0 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_lshr_b32 s0, s5, 31 +; GFX9-NEXT: s_mov_b32 s1, s9 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX9-NEXT: s_lshr_b32 s8, s7, 31 +; GFX9-NEXT: s_or_b64 
s[0:1], s[0:1], s[12:13] +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i128_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s12, 0x41 +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s12 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s14, 63, 64 +; GFX10-NEXT: s_sub_i32 s0, 64, 63 +; GFX10-NEXT: s_cmp_lt_u32 63, 64 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 63, 0 +; GFX10-NEXT: s_mov_b32 s9, s1 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s0 +; GFX10-NEXT: s_lshr_b32 s8, s5, 31 +; GFX10-NEXT: s_lshr_b32 s0, s7, 31 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) + ret i128 %result +} + +define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { +; GFX6-LABEL: v_fshl_i128_65: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s4, 0x41 +; GFX6-NEXT: s_sub_i32 s6, 64, s4 +; GFX6-NEXT: s_sub_i32 s5, s4, 64 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], s6 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], s4 +; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: s_and_b32 s4, 1, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_sub_i32 s4, 63, 64 +; GFX6-NEXT: s_sub_i32 s5, 64, 63 +; GFX6-NEXT: s_cmp_lt_u32 63, 64 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 63, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX6-NEXT: v_lshl_b64 v[0:1], v[6:7], s5 +; 
GFX6-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 31, v5 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], s4 +; GFX6-NEXT: s_and_b32 s4, 1, s6 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_i128_65: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_movk_i32 s4, 0x41 +; GFX8-NEXT: s_sub_i32 s6, 64, s4 +; GFX8-NEXT: s_sub_i32 s5, s4, 64 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[12:13], s4, v[0:1] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX8-NEXT: s_and_b32 s4, 1, s7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: s_and_b32 s4, 1, s8 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_sub_i32 s4, 63, 64 +; GFX8-NEXT: s_sub_i32 s5, 64, 63 +; GFX8-NEXT: s_cmp_lt_u32 63, 64 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 63, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[6:7] +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 31, v5 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX8-NEXT: s_and_b32 s4, 1, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_i128_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x41 +; GFX9-NEXT: s_sub_i32 s6, 64, s4 +; GFX9-NEXT: s_sub_i32 s5, s4, 64 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[12:13], s4, v[0:1] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; 
GFX9-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: s_and_b32 s4, 1, s8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_sub_i32 s4, 63, 64 +; GFX9-NEXT: s_sub_i32 s5, 64, 63 +; GFX9-NEXT: s_cmp_lt_u32 63, 64 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 63, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 31, v5 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], s4, v[6:7] +; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_i128_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_movk_i32 s4, 0x41 +; GFX10-NEXT: v_lshrrev_b32_e32 v19, 31, v5 +; GFX10-NEXT: s_sub_i32 s5, 64, s4 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], s4, v[2:3] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: s_sub_i32 s5, s4, 64 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: v_lshlrev_b64 v[12:13], s4, v[0:1] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_sub_i32 s5, 64, 63 +; GFX10-NEXT: v_or_b32_e32 v15, v9, v11 +; GFX10-NEXT: v_or_b32_e32 v14, v8, v10 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[6:7] +; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s7, 1, s4 +; GFX10-NEXT: s_sub_i32 s4, 63, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v27, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_lshrrev_b64 v[23:24], s4, v[6:7] +; GFX10-NEXT: s_cmp_lt_u32 63, 64 +; GFX10-NEXT: v_or_b32_e32 v6, v19, v8 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 63, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s7 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v23, v6, s4 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v24, v9, s4 +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v0, v2, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 31, v7 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v1, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v10, v3, s6 +; GFX10-NEXT: v_or_b32_e32 v0, v11, v4 +; GFX10-NEXT: v_or_b32_e32 
v1, v27, v5 +; GFX10-NEXT: v_or_b32_e32 v2, v19, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 65) + ret i128 %result +} + +define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { +; GFX6-LABEL: s_fshl_v2i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s18, 0x7f +; GFX6-NEXT: s_mov_b32 s19, 0 +; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX6-NEXT: s_sub_i32 s17, s22, 64 +; GFX6-NEXT: s_sub_i32 s23, 64, s22 +; GFX6-NEXT: s_cmp_lt_u32 s22, 64 +; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s22, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 +; GFX6-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s28, 1, 64 +; GFX6-NEXT: s_sub_i32 s29, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s30, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[22:23], s[8:9], 1 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[10:11], s29 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 +; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s28 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s30, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX6-NEXT: s_sub_i32 s26, s16, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s16 +; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_cselect_b32 s27, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s30, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s26 +; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s30, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] +; GFX6-NEXT: s_sub_i32 s11, s8, 64 +; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; 
GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[14:15], s29 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s28 +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX6-NEXT: s_sub_i32 s18, s10, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_v2i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s18, 0x7f +; GFX8-NEXT: s_mov_b32 s19, 0 +; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX8-NEXT: s_sub_i32 s17, s22, 64 +; GFX8-NEXT: s_sub_i32 s23, 64, s22 +; GFX8-NEXT: s_cmp_lt_u32 s22, 64 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s22, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 +; GFX8-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s28, 1, 64 +; GFX8-NEXT: s_sub_i32 s29, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[22:23], s[8:9], 1 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[10:11], s29 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 +; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s28 +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s30, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX8-NEXT: s_sub_i32 s26, s16, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s16 +; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cselect_b32 s30, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s26 +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], 
s[16:17], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s30, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] +; GFX8-NEXT: s_sub_i32 s11, s8, 64 +; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[14:15], s29 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s28 +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX8-NEXT: s_sub_i32 s18, s10, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_v2i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s18, 0x7f +; GFX9-NEXT: s_mov_b32 s19, 0 +; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX9-NEXT: s_sub_i32 s17, s22, 64 +; GFX9-NEXT: s_sub_i32 s23, 64, s22 +; GFX9-NEXT: s_cmp_lt_u32 s22, 64 +; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s22, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s22 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s23 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 +; GFX9-NEXT: s_or_b64 s[22:23], s[26:27], s[22:23] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s28, 1, 64 +; GFX9-NEXT: s_sub_i32 s29, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: 
s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[8:9], 1 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[10:11], s29 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], 1 +; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[26:27] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s28 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s30, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 +; GFX9-NEXT: s_sub_i32 s26, s16, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s16 +; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_cselect_b32 s27, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s30, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s22 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s26 +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[16:17], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s30, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] +; GFX9-NEXT: s_sub_i32 s11, s8, 64 +; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 +; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[14:15], s29 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 1 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s28 +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX9-NEXT: s_sub_i32 s18, s10, 64 +; GFX9-NEXT: s_sub_i32 s14, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[4:5], s14 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], 
s[4:5] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s18, 0x7f +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_mov_b32 s30, s0 +; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX10-NEXT: s_sub_i32 s17, s22, 64 +; GFX10-NEXT: s_sub_i32 s23, 64, s22 +; GFX10-NEXT: s_cmp_lt_u32 s22, 64 +; GFX10-NEXT: s_mov_b32 s31, s1 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s22, 0 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[30:31], s23 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s22 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[30:31], s22 +; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[30:31], s17 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s28, 1, 64 +; GFX10-NEXT: s_sub_i32 s29, 64, 1 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s29 +; GFX10-NEXT: s_lshr_b64 s[26:27], s[10:11], 1 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s28 +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_cselect_b64 s[46:47], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[26:27], 0 +; GFX10-NEXT: s_sub_i32 s26, s16, 64 +; GFX10-NEXT: s_sub_i32 s17, 64, s16 +; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cselect_b32 s30, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[46:47], s16 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s26 +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s30, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[46:47], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX10-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_sub_i32 s11, s8, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: 
s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s29 +; GFX10-NEXT: s_lshr_b64 s[18:19], s[14:15], 1 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s28 +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[18:19], 0 +; GFX10-NEXT: s_sub_i32 s18, s10, 64 +; GFX10-NEXT: s_sub_i32 s11, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 +; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) + ret <2 x i128> %result +} + +define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) { +; GFX6-LABEL: v_fshl_v2i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_movk_i32 s6, 0x7f +; GFX6-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v23 +; GFX6-NEXT: s_sub_i32 s7, 64, 1 +; GFX6-NEXT: s_sub_i32 s8, 1, 64 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_or_b32_e32 v24, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v25, v18, v22 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[8:9], 1 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[10:11], s7 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s4, 1, s4 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[10:11], s8 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s5 +; GFX6-NEXT: v_lshr_b64 v[10:11], v[10:11], 1 +; GFX6-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v18, v21, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX6-NEXT: v_and_b32_e32 v21, s6, v16 +; GFX6-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v21 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[10:11], v16 +; GFX6-NEXT: v_lshr_b64 v[18:19], v[8:9], v21 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v23 +; GFX6-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v0, vcc +; 
GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v16, v17, v25, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX6-NEXT: v_subrev_i32_e64 v0, s[4:5], 64, v21 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v21 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v22, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX6-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX6-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX6-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v16 +; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v16 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v18 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], s7 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s6, 1, s4 +; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], s8 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 +; GFX6-NEXT: s_and_b32 s5, 1, s5 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[14:15], 1 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s4, 1, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v17 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v17 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 +; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 64, v17 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v17 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v12 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v18, v4 +; GFX6-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX6-NEXT: v_or_b32_e32 
v6, v16, v6 +; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshl_v2i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_movk_i32 s6, 0x7f +; GFX8-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX8-NEXT: s_sub_i32 s7, 64, 1 +; GFX8-NEXT: s_sub_i32 s8, 1, 64 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_or_b32_e32 v24, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v25, v18, v22 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], 1, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], s7, v[10:11] +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], s8, v[10:11] +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s5 +; GFX8-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX8-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v21, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX8-NEXT: v_and_b32_e32 v21, s6, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v21 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[18:19], v21, v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v23 +; GFX8-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v25, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX8-NEXT: v_subrev_u32_e64 v0, s[4:5], 64, v21 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v21, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v22, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX8-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX8-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX8-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_cndmask_b32_e32 
v18, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s7, v[14:15] +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s6, 1, s4 +; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[14:15] +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 +; GFX8-NEXT: s_and_b32 s5, 1, s5 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v17 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] +; GFX8-NEXT: v_subrev_u32_e32 v12, vcc, 64, v17 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v18, v4 +; GFX8-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshl_v2i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s6, 0x7f +; GFX9-NEXT: v_and_b32_e32 v23, s6, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[2:3] +; GFX9-NEXT: s_sub_i32 s7, 64, 1 +; GFX9-NEXT: s_sub_i32 s8, 1, 64 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_or_b32_e32 v24, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v25, v18, v22 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 1, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], s7, v[10:11] +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], s8, v[10:11] +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s5 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX9-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v21, s6, v16 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v18, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, 
vcc +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GFX9-NEXT: v_sub_u32_e32 v16, 64, v21 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], v21, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX9-NEXT: v_or_b32_e32 v18, v18, v16 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v23 +; GFX9-NEXT: v_or_b32_e32 v19, v19, v17 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v16, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v23, v[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v25, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v17, v0, v2, s[4:5] +; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v21 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v3, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v21 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v1, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v21, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, v1, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v0, v22, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v18, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v17, v8 +; GFX9-NEXT: v_or_b32_e32 v3, v16, v9 +; GFX9-NEXT: v_and_b32_e32 v16, s6, v20 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX9-NEXT: v_and_b32_e32 v17, s6, v8 +; GFX9-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v16, v[6:7] +; GFX9-NEXT: v_subrev_u32_e32 v18, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v16, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v18, v[4:5] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s7, v[14:15] +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s6, 1, s4 +; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[14:15] +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 +; GFX9-NEXT: s_and_b32 s5, 1, s5 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v17 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] +; GFX9-NEXT: v_subrev_u32_e32 v12, 64, v17 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; 
GFX9-NEXT: v_lshrrev_b64 v[8:9], v17, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v12, v[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v18, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v19, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v16, v6 +; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshl_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 +; GFX10-NEXT: s_movk_i32 s7, 0x7f +; GFX10-NEXT: s_sub_i32 s8, 64, 1 +; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: v_lshlrev_b64 v[18:19], s8, v[10:11] +; GFX10-NEXT: v_and_b32_e32 v28, s7, v17 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], 1, v[8:9] +; GFX10-NEXT: s_sub_i32 s9, 1, 64 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_lshrrev_b64 v[21:22], s9, v[10:11] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: v_mov_b32_e32 v29, v2 +; GFX10-NEXT: v_mov_b32_e32 v30, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v23, 64, v27 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v16, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v22, v17, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v23, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[29:30] +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v34, v21, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v35, v22, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v10, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v11, s4 +; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX10-NEXT: v_subrev_nc_u32_e32 v31, 64, v27 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[34:35] +; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v31, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 +; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 +; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v17, v24, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v22, vcc_lo +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v34, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v35, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, v1, s4 +; GFX10-NEXT: v_xor_b32_e32 v16, -1, v20 +; GFX10-NEXT: v_or_b32_e32 
v0, v21, v8 +; GFX10-NEXT: v_or_b32_e32 v1, v11, v9 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[10:11], s8, v[14:15] +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v27 +; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[16:17], s9, v[14:15] +; GFX10-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: v_and_b32_e32 v24, s7, v20 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v22, v19, v30, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v16, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v31, v17, v11, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v29, s6 +; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v24 +; GFX10-NEXT: v_lshlrev_b64 v[14:15], v24, v[6:7] +; GFX10-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v8, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v9, s4 +; GFX10-NEXT: v_sub_nc_u32_e32 v31, 64, v27 +; GFX10-NEXT: v_lshrrev_b64 v[35:36], v18, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v27 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v27, v[12:13] +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v31, v[8:9] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 +; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v5, v36, v15 +; GFX10-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[8:9] +; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v16, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v27 +; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v31, v3, v14, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[3:4], v27, v[8:9] +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v24 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v18, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v27 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v14, v5, v7, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, v4, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v31, v6, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v10, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v13, s5 +; GFX10-NEXT: v_or_b32_e32 v3, v22, v23 +; GFX10-NEXT: v_or_b32_e32 v7, v14, v11 +; GFX10-NEXT: v_or_b32_e32 v4, v15, v5 +; GFX10-NEXT: v_or_b32_e32 v6, v19, v10 +; GFX10-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) + ret <2 x i128> %result +} + +declare i7 @llvm.fshl.i7(i7, i7, i7) #0 +declare i8 @llvm.fshl.i8(i8, i8, i8) #0 +declare <2 x i8> @llvm.fshl.v2i8(<2 x i8>, <2 x i8>, <2 x i8>) #0 +declare <4 x i8> @llvm.fshl.v4i8(<4 x i8>, <4 x i8>, <4 x i8>) #0 + +declare i16 @llvm.fshl.i16(i16, i16, i16) #0 +declare <2 x i16> @llvm.fshl.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.fshl.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.fshl.v4i16(<4 x i16>, 
<4 x i16>, <4 x i16>) #0 +declare <5 x i16> @llvm.fshl.v5i16(<5 x i16>, <5 x i16>, <5 x i16>) #0 +declare <6 x i16> @llvm.fshl.v6i16(<6 x i16>, <6 x i16>, <6 x i16>) #0 +declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) #0 + +declare i24 @llvm.fshl.i24(i24, i24, i24) #0 +declare <2 x i24> @llvm.fshl.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) #0 + +declare i32 @llvm.fshl.i32(i32, i32, i32) #0 +declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #0 +declare <3 x i32> @llvm.fshl.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) #0 +declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #0 +declare <5 x i32> @llvm.fshl.v5i32(<5 x i32>, <5 x i32>, <5 x i32>) #0 +declare <16 x i32> @llvm.fshl.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) #0 + +declare i48 @llvm.fshl.i48(i48, i48, i48) #0 + +declare i64 @llvm.fshl.i64(i64, i64, i64) #0 +declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) #0 + +declare i128 @llvm.fshl.i128(i128, i128, i128) #0 +declare <2 x i128> @llvm.fshl.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -0,0 +1,7572 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefixes=GCN,GFX6 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GCN,GFX10 %s + +define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { +; GFX6-LABEL: s_fshr_i7: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_sub_i32 s3, 0, 7 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_movk_i32 s3, 0x7f +; GFX6-NEXT: s_and_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i7: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_sub_i32 s3, 0, 7 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX8-NEXT: s_movk_i32 s3, 0x7f +; GFX8-NEXT: s_and_b32 s2, 
s2, s3 +; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX8-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX8-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i7: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s3, 0, 7 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: s_movk_i32 s3, 0x7f +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX9-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX9-NEXT: v_lshrrev_b16_e64 v0, v0, s1 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 +; GFX10-NEXT: s_sub_i32 s3, 0, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: s_movk_i32 s3, 0x7f +; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_sub_nc_u16_e64 v1, 6, v0 +; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s1 +; GFX10-NEXT: v_lshlrev_b16_e64 v1, v1, s0 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) + ret i7 %result +} + +define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { +; GFX6-LABEL: v_fshr_i7: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: 
v_cvt_f32_ubyte0_e32 v3, 7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX6-NEXT: s_sub_i32 s4, 0, 7 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i7: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX8-NEXT: s_sub_i32 s4, 0, 7 +; GFX8-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX8-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX8-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 +; GFX8-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_i32 s4, 0, 7 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x7f +; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 +; GFX9-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 +; GFX10-NEXT: s_sub_i32 s4, 0, 7 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f +; GFX10-NEXT: v_sub_nc_u16_e64 v4, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v7, v4, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v7, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) + ret i7 %result +} + +define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { +; GFX6-LABEL: s_fshr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s3, s2, 7 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s3 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s3, s2, 7 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) + ret i8 %result +} + +define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { +; GFX6-LABEL: v_fshr_i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v3, 
v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt) + ret i8 %result +} + +define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) { +; GFX6-LABEL: s_fshr_i8_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 4 +; GFX6-NEXT: s_lshr_b32 s1, s1, 4 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i8_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshr_b32 s1, s1, 4 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i8_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshr_b32 s1, s1, 4 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i8_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, 4 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) + ret i8 %result +} + +define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_fshr_i8_4: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i8_4: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 4 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: v_fshr_i8_4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 4 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i8_4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4) + ret i8 %result +} + +define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) { +; GFX6-LABEL: s_fshr_i8_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_lshl_b32 s0, s0, 3 +; GFX6-NEXT: s_lshr_b32 s1, s1, 5 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i8_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshr_b32 s1, s1, 5 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i8_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: s_lshr_b32 s1, s1, 5 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i8_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, 3 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshr_b32 s1, s1, 5 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) + ret i8 %result +} + +define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) { +; GFX6-LABEL: v_fshr_i8_5: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i8_5: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, 5 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 3, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i8_5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i8_5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 3, v0 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, 5, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5) + ret i8 %result +} + +define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg 
%lhs.arg, i16 inreg %rhs.arg, i16 inreg %amt.arg) { +; GFX6-LABEL: s_fshr_v2i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s7, 0xff +; GFX6-NEXT: s_lshr_b32 s3, s0, 8 +; GFX6-NEXT: s_lshr_b32 s4, s1, 8 +; GFX6-NEXT: s_lshr_b32 s5, s2, 8 +; GFX6-NEXT: s_and_b32 s6, s2, 7 +; GFX6-NEXT: s_and_b32 s1, s1, s7 +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s6 +; GFX6-NEXT: s_andn2_b32 s2, 7, s5 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s1, s5, 7 +; GFX6-NEXT: s_and_b32 s3, s4, s7 +; GFX6-NEXT: s_lshr_b32 s1, s3, s1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s1, s7 +; GFX6-NEXT: s_and_b32 s0, s0, s7 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_v2i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s1, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_and_b32 s4, s4, s2 +; GFX8-NEXT: s_lshr_b32 s1, s1, s6 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_andn2_b32 s5, 7, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshl_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s1, s4, s1 +; GFX8-NEXT: s_or_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_v2i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s5, s2, 8 +; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s1, 8 +; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_and_b32 s4, s4, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s6 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_andn2_b32 s5, 7, s5 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_lshl_b32 s3, s3, s5 +; GFX9-NEXT: s_lshr_b32 s1, s4, s1 +; GFX9-NEXT: s_or_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_v2i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_movk_i32 s7, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_and_b32 s4, s4, s7 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s5, 7 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshr_b32 
s2, s4, s2 +; GFX10-NEXT: s_lshr_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_and_b32 s1, s2, s7 +; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX10-NEXT: s_and_b32 s0, s0, s7 +; GFX10-NEXT: s_lshl_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %lhs = bitcast i16 %lhs.arg to <2 x i8> + %rhs = bitcast i16 %rhs.arg to <2 x i8> + %amt = bitcast i16 %amt.arg to <2 x i8> + %result = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt) + %cast.result = bitcast <2 x i8> %result to i16 + ret i16 %cast.result +} + +define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { +; GFX6-LABEL: v_fshr_v2i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX6-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v2i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v2i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v3, v3, v5 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v7, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_setpc_b64 s[30:31] + %lhs = bitcast i16 %lhs.arg to <2 x i8> + %rhs = bitcast i16 %rhs.arg to <2 x i8> + %amt = bitcast i16 %amt.arg to <2 x i8> + %result = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> %lhs, <2 x i8> %rhs, <2 x i8> %amt) + %cast.result = bitcast <2 x i8> %result to i16 + ret i16 %cast.result +} + +define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 inreg %amt.arg) { +; GFX6-LABEL: s_fshr_v4i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s13, 0xff +; GFX6-NEXT: s_lshr_b32 s3, s0, 8 +; GFX6-NEXT: s_lshr_b32 s4, s0, 16 +; GFX6-NEXT: s_lshr_b32 s5, s0, 24 +; GFX6-NEXT: s_lshr_b32 s6, s1, 8 +; GFX6-NEXT: s_lshr_b32 s7, s1, 16 +; GFX6-NEXT: s_lshr_b32 s8, s1, 24 +; GFX6-NEXT: s_lshr_b32 s9, s2, 8 +; GFX6-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NEXT: s_lshr_b32 s11, s2, 24 +; GFX6-NEXT: s_and_b32 s12, s2, 7 +; GFX6-NEXT: s_and_b32 s1, s1, s13 +; GFX6-NEXT: s_andn2_b32 s2, 7, s2 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s2 +; GFX6-NEXT: s_lshr_b32 s1, s1, s12 +; GFX6-NEXT: s_andn2_b32 s2, 7, s9 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s1, s9, 7 +; GFX6-NEXT: s_and_b32 s3, s6, s13 +; GFX6-NEXT: s_lshr_b32 s1, s3, s1 +; GFX6-NEXT: s_andn2_b32 s3, 7, s10 +; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s10, 7 +; GFX6-NEXT: s_and_b32 s4, s7, s13 +; GFX6-NEXT: s_lshr_b32 s2, s4, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s13 +; GFX6-NEXT: s_or_b32 s2, s3, s2 +; GFX6-NEXT: s_and_b32 s3, s11, 7 +; GFX6-NEXT: s_andn2_b32 s4, 7, s11 +; GFX6-NEXT: s_lshl_b32 s5, s5, 1 +; GFX6-NEXT: s_and_b32 s0, s0, s13 +; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_or_b32 s0, 
s0, s1 +; GFX6-NEXT: s_and_b32 s1, s2, s13 +; GFX6-NEXT: s_lshl_b32 s4, s5, s4 +; GFX6-NEXT: s_lshr_b32 s3, s8, s3 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_and_b32 s1, s3, s13 +; GFX6-NEXT: s_lshl_b32 s1, s1, 24 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_v4i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s13, 0xff +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_lshr_b32 s9, s2, 8 +; GFX8-NEXT: s_lshr_b32 s10, s2, 16 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_andn2_b32 s2, 7, s9 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 +; GFX8-NEXT: s_lshr_b32 s1, s1, s12 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_andn2_b32 s3, 7, s10 +; GFX8-NEXT: s_lshl_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s4, s2 +; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_or_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s3, s8, s3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s13, 0xff +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshr_b32 s6, s1, 8 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NEXT: s_lshr_b32 s8, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_lshr_b32 s9, s2, 8 +; GFX9-NEXT: s_lshr_b32 s10, s2, 16 +; GFX9-NEXT: s_lshr_b32 s11, s2, 24 +; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_andn2_b32 s2, 7, s9 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_lshr_b32 s1, s1, s12 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_andn2_b32 s3, 7, s10 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX9-NEXT: s_lshr_b32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: 
s_or_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_lshl_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s3, s8, s3 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_v4i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s6, s1, 8 +; GFX10-NEXT: s_movk_i32 s13, 0xff +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s6, s6, s13 +; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s2, s9, 7 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_lshr_b32 s2, s6, s2 +; GFX10-NEXT: s_and_b32 s6, s7, s13 +; GFX10-NEXT: s_lshl_b32 s3, s3, s9 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_or_b32 s1, s3, s2 +; GFX10-NEXT: s_and_b32 s2, s10, 7 +; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_andn2_b32 s3, 7, s10 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_lshr_b32 s2, s6, s2 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: s_andn2_b32 s4, 7, s11 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_and_b32 s6, s11, 7 +; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_lshr_b32 s5, s8, s6 +; GFX10-NEXT: s_and_b32 s0, s0, s13 +; GFX10-NEXT: s_lshl_b32 s1, s1, 8 +; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s2, s13 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s3, s13 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: ; return to shader part epilog + %lhs = bitcast i32 %lhs.arg to <4 x i8> + %rhs = bitcast i32 %rhs.arg to <4 x i8> + %amt = bitcast i32 %amt.arg to <4 x i8> + %result = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt) + %cast.result = bitcast <4 x i8> %result to i32 + ret i32 %cast.result +} + +define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { +; GFX6-LABEL: v_fshr_v4i8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX6-NEXT: v_and_b32_e32 v12, 7, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX6-NEXT: 
v_and_b32_e32 v2, 7, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v12, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 7, v9 +; GFX6-NEXT: v_xor_b32_e32 v9, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, v1, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v10 +; GFX6-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_mov_b32_e32 v2, 0xff +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v9, v3 +; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 7, v10 +; GFX6-NEXT: v_and_b32_e32 v6, v7, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, v3, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v11 +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 7, v11 +; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v3, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, v6, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v4, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_and_b32_e32 v1, v4, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v4i8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v9, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, 0xff +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v5, v8 +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; 
GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v9 +; GFX9-NEXT: v_lshrrev_b16_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v6 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 7, v6 +; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xff +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v5, v8 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v2, s4, v1 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v4i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX10-NEXT: v_and_b32_e32 v15, 7, v8 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0 +; 
GFX10-NEXT: v_and_b32_e32 v14, 7, v11 +; GFX10-NEXT: v_lshlrev_b16_e64 v3, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0 +; GFX10-NEXT: v_mov_b32_e32 v15, 0xff +; GFX10-NEXT: v_lshlrev_b16_e64 v3, v14, v3 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 +; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v15, 7, v14 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, 1, v5 +; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_lshrrev_b16_e64 v6, v6, v7 +; GFX10-NEXT: v_lshlrev_b16_e64 v4, v11, v4 +; GFX10-NEXT: v_lshrrev_b16_e64 v1, v10, v1 +; GFX10-NEXT: v_lshlrev_b16_e64 v5, v15, v5 +; GFX10-NEXT: v_lshrrev_b16_e64 v7, v12, v9 +; GFX10-NEXT: v_lshrrev_b16_e64 v2, v2, v8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %lhs = bitcast i32 %lhs.arg to <4 x i8> + %rhs = bitcast i32 %rhs.arg to <4 x i8> + %amt = bitcast i32 %amt.arg to <4 x i8> + %result = call <4 x i8> @llvm.fshr.v4i8(<4 x i8> %lhs, <4 x i8> %rhs, <4 x i8> %amt) + %cast.result = bitcast <4 x i8> %result to i32 + ret i32 %cast.result +} + +define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) { +; GFX6-LABEL: s_fshr_i24: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_sub_i32 s3, 0, 24 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX6-NEXT: s_mov_b32 s3, 0xffffff +; GFX6-NEXT: s_and_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_lshr_b32_e32 v0, s1, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i24: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_sub_i32 
s3, 0, 24
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0
+; GFX8-NEXT: s_mov_b32 s3, 0xffffff
+; GFX8-NEXT: s_and_b32 s2, s2, s3
+; GFX8-NEXT: s_and_b32 s1, s1, s3
+; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0
+; GFX8-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX8-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: s_sub_i32 s3, 0, 24
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0
+; GFX9-NEXT: s_mov_b32 s3, 0xffffff
+; GFX9-NEXT: s_and_b32 s2, s2, s3
+; GFX9-NEXT: s_and_b32 s1, s1, s3
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0
+; GFX9-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX9-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s1
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, v1, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT: s_sub_i32 s3, 0, 24
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0
+; GFX10-NEXT: s_mov_b32 s3, 0xffffff
+; GFX10-NEXT: s_and_b32 s2, s2, s3
+; GFX10-NEXT: s_and_b32 s1, s1, s3
+; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0
+; GFX10-NEXT: v_and_b32_e32 v0, s3, v0
+; GFX10-NEXT: v_and_b32_e32 v1, s3, v1
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s1
+; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
+ ret i24 %result
+}
+
+define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
+; GFX6-LABEL: v_fshr_i24:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX6-NEXT: s_sub_i32 s4, 0, 24
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_lo_u32 v4, s4, v3
+; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_and_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i24:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT: s_sub_i32 s4, 0, 24
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_lo_u32 v4, s4, v3
+; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT: v_and_b32_e32 v3, v3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX9-NEXT: s_sub_i32 s4, 0, 24
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_lo_u32 v4, s4, v3
+; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3
+; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX9-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24
+; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
+; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX9-NEXT: v_and_b32_e32 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v3, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24
+; GFX10-NEXT: s_sub_i32 s4, 0, 24
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_lo_u32 v4, s4, v3
+; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX10-NEXT: v_and_b32_e32 v3, v3, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i24 @llvm.fshr.i24(i24 %lhs, i24 %rhs, i24 %amt)
+ ret i24 %result
+}
+
+define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 inreg %amt.arg) {
+; GFX6-LABEL: s_fshr_v2i24:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_movk_i32 s10, 0xff
+; GFX6-NEXT: s_lshr_b32 s9, s1, 8
+; GFX6-NEXT: s_and_b32 s1, s1, s10
+; GFX6-NEXT: s_lshr_b32 s6, s0, 8
+; GFX6-NEXT: s_lshr_b32 s8, s0, 24
+; GFX6-NEXT: s_lshl_b32 s1, s1, 8
+; GFX6-NEXT: s_or_b32 s1, s8, s1
+; GFX6-NEXT: s_and_b32 s6, s6, s10
+; GFX6-NEXT: s_lshr_b32 s8, s2, 8
+; GFX6-NEXT: s_and_b32 s8, s8, s10
+; GFX6-NEXT: s_lshr_b32 s7, s0, 16
+; GFX6-NEXT: s_and_b32 s0, s0, s10
+; GFX6-NEXT: s_lshl_b32 s6, s6, 8
+; GFX6-NEXT: s_or_b32 s0, s0, s6
+; GFX6-NEXT: s_and_b32 s6, s7, s10
+; GFX6-NEXT: s_and_b32 s7, s9, s10
+; GFX6-NEXT: s_lshr_b32 s9, s2, 16
+; GFX6-NEXT: s_lshr_b32 s11, s2, 24
+; GFX6-NEXT: s_and_b32 s2, s2, s10
+; GFX6-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NEXT: s_or_b32 s2, s2, s8
+; GFX6-NEXT: s_and_b32 s8, s9, s10
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX6-NEXT: s_lshr_b32 s12, s3, 8
+; GFX6-NEXT: s_and_b32 s3, s3, s10
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshl_b32 s8, s8, 16
+; GFX6-NEXT: s_lshl_b32 s3, s3, 8
+; GFX6-NEXT: s_or_b32 s2, s2, s8
+; GFX6-NEXT: s_and_b32 s8, s12, s10
+; GFX6-NEXT: s_or_b32 s3, s11, s3
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX6-NEXT: s_lshl_b32 s8, s8, 16
+; GFX6-NEXT: s_or_b32 s3, s3, s8
+; GFX6-NEXT: s_lshr_b32 s8, s4, 8
+; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: s_and_b32 s8, s8, s10
+; GFX6-NEXT: s_lshr_b32 s9, s4, 16
+; GFX6-NEXT: s_lshr_b32 s11, s4, 24
+; GFX6-NEXT: s_and_b32 s4, s4, s10
+; GFX6-NEXT: s_lshl_b32 s8, s8, 8
+; GFX6-NEXT: s_or_b32 s4, s4, s8
+; GFX6-NEXT: s_and_b32 s8, s9, s10
+; GFX6-NEXT: s_sub_i32 s9, 0, 24
+; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX6-NEXT: s_lshl_b32 s8, s8, 16
+; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT: s_or_b32 s4, s4, s8
+; GFX6-NEXT: s_lshr_b32 s12, s5, 8
+; GFX6-NEXT: s_and_b32 s5, s5, s10
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v1, 24
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX6-NEXT: s_lshl_b32 s5, s5, 8
+; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT: s_and_b32 s8, s12, s10
+; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v0
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1
+; GFX6-NEXT: s_or_b32 s5, s11, s5
+; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2
+; GFX6-NEXT: s_lshl_b32 s8, s8, 16
+; GFX6-NEXT: s_or_b32 s5, s5, s8
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1
+; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX6-NEXT: s_mov_b32 s8, 0xffffff
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v0
+; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX6-NEXT: s_lshl_b32 s4, s6, 17
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: v_and_b32_e32 v0, s8, v0
+; GFX6-NEXT: s_or_b32 s0, s4, s0
+; GFX6-NEXT: v_and_b32_e32 v2, s8, v3
+; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 23, v1
+; GFX6-NEXT: s_lshl_b32 s0, s7, 17
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX6-NEXT: v_and_b32_e32 v2, s10, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_and_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, s10, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX6-NEXT: v_and_b32_e32 v1, s10, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_and_b32_e32 v2, s10, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v1, s10, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i24:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_movk_i32 s10, 0xff
+; GFX8-NEXT: s_lshr_b32 s9, s1, 8
+; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000
+; GFX8-NEXT: s_and_b32 s1, s1, s10
+; GFX8-NEXT: s_lshr_b32 s6, s0, 8
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_lshl_b32 s1, s1, s11
+; GFX8-NEXT: s_or_b32 s1, s8, s1
+; GFX8-NEXT: s_and_b32 s6, s6, s10
+; GFX8-NEXT: s_lshr_b32 s8, s2, 8
+; GFX8-NEXT: s_and_b32 s8, s8, s10
+; GFX8-NEXT: s_lshr_b32 s7, s0, 16
+; GFX8-NEXT: s_and_b32 s0, s0, s10
+; GFX8-NEXT: s_lshl_b32 s6, s6, s11
+; GFX8-NEXT: s_or_b32 s0, s0, s6
+; GFX8-NEXT: s_and_b32 s6, s7, s10
+; GFX8-NEXT: s_and_b32 s7, s9, s10
+; GFX8-NEXT: s_lshr_b32 s9, s2, 16
+; GFX8-NEXT: s_lshr_b32 s12, s2, 24
+; GFX8-NEXT: s_and_b32 s2, s2, s10
+; GFX8-NEXT: s_lshl_b32 s8, s8, s11
+; GFX8-NEXT: s_or_b32 s2, s2, s8
+; GFX8-NEXT: s_and_b32 s8, s9, s10
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: s_lshr_b32 s13, s3, 8
+; GFX8-NEXT: s_and_b32 s3, s3, s10
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: s_lshl_b32 s3, s3, s11
+; GFX8-NEXT: s_or_b32 s2, s2, s8
+; GFX8-NEXT: s_and_b32 s8, s13, s10
+; GFX8-NEXT: s_or_b32 s3, s12, s3
+; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: s_or_b32 s3, s3, s8
+; GFX8-NEXT: s_lshr_b32 s8, s4, 8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT: s_and_b32 s8, s8, s10
+; GFX8-NEXT: s_lshr_b32 s9, s4, 16
+; GFX8-NEXT: s_lshr_b32 s12, s4, 24
+; GFX8-NEXT: s_and_b32 s4, s4, s10
+; GFX8-NEXT: s_lshl_b32 s8, s8, s11
+; GFX8-NEXT: s_or_b32 s4, s4, s8
+; GFX8-NEXT: s_and_b32 s8, s9, s10
+; GFX8-NEXT: s_sub_i32 s9, 0, 24
+; GFX8-NEXT: v_mul_lo_u32 v1, s9, v0
+; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT: s_or_b32 s4, s4, s8
+; GFX8-NEXT: s_lshr_b32 s13, s5, 8
+; GFX8-NEXT: s_and_b32 s5, s5, s10
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v1, 24
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT: s_lshl_b32 s5, s5, s11
+; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT: s_and_b32 s8, s13, s10
+; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_mul_lo_u32 v2, s9, v1
+; GFX8-NEXT: s_or_b32 s5, s12, s5
+; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT: s_lshl_b32 s8, s8, 16
+; GFX8-NEXT: s_or_b32 s5, s5, s8
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_mul_hi_u32 v1, s5, v1
+; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX8-NEXT: s_mov_b32 s8, 0xffffff
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v0
+; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX8-NEXT: s_lshl_b32 s4, s6, 17
+; GFX8-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-NEXT: v_and_b32_e32 v0, s8, v0
+; GFX8-NEXT: s_or_b32 s0, s4, s0
+; GFX8-NEXT: v_and_b32_e32 v2, s8, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s2
+; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s5, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT: s_lshl_b32 s0, s7, 17
+; GFX8-NEXT: s_lshl_b32 s1, s1, 1
+; GFX8-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v4
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0
+; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s3
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 8
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_and_b32_e32 v2, s10, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_movk_i32 s11, 0xff
+; GFX9-NEXT: s_lshr_b32 s10, s1, 8
+; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000
+; GFX9-NEXT: s_and_b32 s1, s1, s11
+; GFX9-NEXT: s_lshr_b32 s7, s0, 8
+; GFX9-NEXT: s_lshr_b32 s9, s0, 24
+; GFX9-NEXT: s_lshl_b32 s1, s1, s12
+; GFX9-NEXT: s_or_b32 s1, s9, s1
+; GFX9-NEXT: s_and_b32 s7, s7, s11
+; GFX9-NEXT: s_lshr_b32 s9, s2, 8
+; GFX9-NEXT: s_and_b32 s9, s9, s11
+; GFX9-NEXT: s_lshr_b32 s8, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, s11
+; GFX9-NEXT: s_lshl_b32 s7, s7, s12
+; GFX9-NEXT: s_or_b32 s0, s0, s7
+; GFX9-NEXT: s_and_b32 s7, s8, s11
+; GFX9-NEXT: s_and_b32 s8, s10, s11
+; GFX9-NEXT: s_lshr_b32 s10, s2, 16
+; GFX9-NEXT: s_lshr_b32 s13, s2, 24
+; GFX9-NEXT: s_and_b32 s2, s2, s11
+; GFX9-NEXT: s_lshl_b32 s9, s9, s12
+; GFX9-NEXT: s_or_b32 s2, s2, s9
+; GFX9-NEXT: s_and_b32 s9, s10, s11
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: s_lshr_b32 s14, s3, 8
+; GFX9-NEXT: s_and_b32 s3, s3, s11
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: s_lshl_b32 s3, s3, s12
+; GFX9-NEXT: s_or_b32 s2, s2, s9
+; GFX9-NEXT: s_and_b32 s9, s14, s11
+; GFX9-NEXT: s_or_b32 s3, s13, s3
+; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: s_or_b32 s3, s3, s9
+; GFX9-NEXT: s_lshr_b32 s9, s4, 8
+; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT: s_and_b32 s9, s9, s11
+; GFX9-NEXT: s_lshr_b32 s10, s4, 16
+; GFX9-NEXT: s_lshr_b32 s13, s4, 24
+; GFX9-NEXT: s_and_b32 s4, s4, s11
+; GFX9-NEXT: s_lshl_b32 s9, s9, s12
+; GFX9-NEXT: s_or_b32 s4, s4, s9
+; GFX9-NEXT: s_and_b32 s9, s10, s11
+; GFX9-NEXT: s_sub_i32 s10, 0, 24
+; GFX9-NEXT: v_mul_lo_u32 v1, s10, v0
+; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT: s_or_b32 s4, s4, s9
+; GFX9-NEXT: s_lshr_b32 s14, s5, 8
+; GFX9-NEXT: s_and_b32 s5, s5, s11
+; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, 24
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX9-NEXT: s_lshl_b32 s5, s5, s12
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX9-NEXT: s_and_b32 s9, s14, s11
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1
+; GFX9-NEXT: s_or_b32 s5, s13, s5
+; GFX9-NEXT: s_bfe_u32 s9, s9, 0x100000
+; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-NEXT: s_or_b32 s5, s5, s9
+; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v0
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX9-NEXT: s_mov_b32 s9, 0xffffff
+; GFX9-NEXT: v_sub_u32_e32 v3, 23, v0
+; GFX9-NEXT: v_and_b32_e32 v0, s9, v0
+; GFX9-NEXT: s_lshl_b32 s4, s7, 17
+; GFX9-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1
+; GFX9-NEXT: s_or_b32 s0, s4, s0
+; GFX9-NEXT: v_and_b32_e32 v3, s9, v3
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, v0, s2
+; GFX9-NEXT: v_lshl_or_b32 v0, s0, v3, v0
+; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffffff
+; GFX9-NEXT: v_sub_u32_e32 v3, 23, v1
+; GFX9-NEXT: v_and_b32_e32 v1, v1, v2
+; GFX9-NEXT: s_lshl_b32 s0, s8, 17
+; GFX9-NEXT: s_lshl_b32 s1, s1, 1
+; GFX9-NEXT: v_and_b32_e32 v3, v3, v2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3
+; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1
+; GFX9-NEXT: s_mov_b32 s6, 8
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, s11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1
+; GFX9-NEXT: v_and_or_b32 v2, v0, s11, v2
+; GFX9-NEXT: v_and_b32_sdwa v0, v0, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX9-NEXT: v_or3_b32 v0, v2, v0, v4
+; GFX9-NEXT: v_and_or_b32 v1, v3, s11, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24
+; GFX10-NEXT: s_sub_i32 s12, 0, 24
+; GFX10-NEXT: s_movk_i32 s9, 0xff
+; GFX10-NEXT: s_lshr_b32 s14, s4, 8
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX10-NEXT: s_lshr_b32 s15, s4, 16
+; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000
+; GFX10-NEXT: s_and_b32 s14, s14, s9
+; GFX10-NEXT: s_and_b32 s16, s4, s9
+; GFX10-NEXT: s_lshl_b32 s14, s14, s10
+; GFX10-NEXT: s_and_b32 s15, s15, s9
+; GFX10-NEXT: s_or_b32 s14, s16, s14
+; GFX10-NEXT: s_lshr_b32 s4, s4, 24
+; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX10-NEXT: s_bfe_u32 s14, s14, 0x100000
+; GFX10-NEXT: s_lshr_b32 s6, s0, 8
+; GFX10-NEXT: s_lshr_b32 s11, s1, 8
+; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX10-NEXT: s_and_b32 s1, s1, s9
+; GFX10-NEXT: s_and_b32 s6, s6, s9
+; GFX10-NEXT: s_lshr_b32 s8, s0, 24
+; GFX10-NEXT: v_mul_lo_u32 v2, s12, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT: s_bfe_u32 s12, s15, 0x100000
+; GFX10-NEXT: s_lshr_b32 s15, s5, 8
+; GFX10-NEXT: s_lshl_b32 s12, s12, 16
+; GFX10-NEXT: s_and_b32 s5, s5, s9
+; GFX10-NEXT: s_or_b32 s12, s14, s12
+; GFX10-NEXT: s_lshl_b32 s5, s5, s10
+; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
+; GFX10-NEXT: s_and_b32 s14, s15, s9
+; GFX10-NEXT: s_or_b32 s4, s4, s5
+; GFX10-NEXT: s_bfe_u32 s5, s14, 0x100000
+; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000
+; GFX10-NEXT: s_lshl_b32 s5, s5, 16
+; GFX10-NEXT: s_lshl_b32 s1, s1, s10
+; GFX10-NEXT: s_or_b32 s4, s4, s5
+; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3
+; GFX10-NEXT: s_or_b32 s1, s8, s1
+; GFX10-NEXT: s_lshr_b32 s8, s2, 8
+; GFX10-NEXT: s_lshr_b32 s7, s0, 16
+; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX10-NEXT: s_and_b32 s0, s0, s9
+; GFX10-NEXT: s_lshl_b32 s6, s6, s10
+; GFX10-NEXT: s_and_b32 s8, s8, s9
+; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX10-NEXT: s_or_b32 s0, s0, s6
+; GFX10-NEXT: s_and_b32 s6, s7, s9
+; GFX10-NEXT: s_and_b32 s7, s11, s9
+; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT: v_mul_hi_u32 v1, s4, v1
+; GFX10-NEXT: s_lshr_b32 s11, s2, 16
+; GFX10-NEXT: s_and_b32 s13, s2, s9
+; GFX10-NEXT: s_lshl_b32 s5, s8, s10
+; GFX10-NEXT: s_and_b32 s8, s11, s9
+; GFX10-NEXT: s_lshr_b32 s11, s3, 8
+; GFX10-NEXT: s_and_b32 s3, s3, s9
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s12, v0
+; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT: s_or_b32 s5, s13, s5
+; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000
+; GFX10-NEXT: s_lshr_b32 s2, s2, 24
+; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: s_lshl_b32 s3, s3, s10
+; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s4, v1
+; GFX10-NEXT: s_mov_b32 s4, 0xffffff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000
+; GFX10-NEXT: s_lshl_b32 s8, s8, 16
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT: s_and_b32 s3, s11, s9
+; GFX10-NEXT: s_or_b32 s5, s5, s8
+; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX10-NEXT: s_lshl_b32 s6, s6, 17
+; GFX10-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s5
+; GFX10-NEXT: s_or_b32 s0, s6, s0
+; GFX10-NEXT: s_or_b32 s2, s2, s3
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX10-NEXT: v_lshl_or_b32 v0, s0, v2, v0
+; GFX10-NEXT: s_lshl_b32 s0, s7, 17
+; GFX10-NEXT: s_lshl_b32 s1, s1, 1
+; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1
+; GFX10-NEXT: v_and_b32_e32 v1, v1, v4
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: v_and_b32_e32 v2, v3, v4
+; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1
+; GFX10-NEXT: s_mov_b32 s0, 8
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT: v_and_b32_e32 v3, s9, v1
+; GFX10-NEXT: v_and_b32_sdwa v4, v1, s9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2
+; GFX10-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; GFX10-NEXT: v_or3_b32 v0, v2, v0, v3
+; GFX10-NEXT: v_and_or_b32 v1, v1, s9, v4
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+ %lhs = bitcast i48 %lhs.arg to <2 x i24>
+ %rhs = bitcast i48 %rhs.arg to <2 x i24>
+ %amt = bitcast i48 %amt.arg to <2 x i24>
+ %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+ %cast.result = bitcast <2 x i24> %result to i48
+ ret i48 %cast.result
+}
+
+define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
+; GFX6-LABEL: v_fshr_v2i24:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GFX6-NEXT: s_sub_i32 s4, 0, 24
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v8, 24
+; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_mul_lo_u32 v7, s4, v6
+; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v8
+; GFX6-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX6-NEXT: v_and_b32_e32 v5, v5, v8
+; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX6-NEXT: v_mul_lo_u32 v6, s4, v7
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 23, v4
+; GFX6-NEXT: v_and_b32_e32 v9, v9, v8
+; GFX6-NEXT: v_and_b32_e32 v4, v4, v8
+; GFX6-NEXT: v_mul_hi_u32 v6, v7, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6
+; GFX6-NEXT: v_and_b32_e32 v3, v3, v8
+; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6
+; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2
+; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2
+; GFX6-NEXT: v_and_b32_e32 v2, v2, v8
+; GFX6-NEXT: v_and_b32_e32 v4, v4, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i24:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GFX8-NEXT: s_sub_i32 s4, 0, 24
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v8, 24
+; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX8-NEXT: v_mul_lo_u32 v7, s4, v6
+; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6
+; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v8
+; GFX8-NEXT: v_mov_b32_e32 v8, 0xffffff
+; GFX8-NEXT: v_and_b32_e32 v5, v5, v8
+; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, s4, v7
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 23, v4
+; GFX8-NEXT: v_and_b32_e32 v9, v9, v8
+; GFX8-NEXT: v_and_b32_e32 v4, v4, v8
+; GFX8-NEXT: v_mul_hi_u32 v6, v7, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, v9, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
+; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6
+; GFX8-NEXT: v_and_b32_e32 v3, v3, v8
+; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2
+; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2
+; GFX8-NEXT: v_and_b32_e32 v2, v2, v8
+; GFX8-NEXT: v_and_b32_e32 v4, v4, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i24:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GFX9-NEXT: s_sub_i32 s4, 0, 24
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v8, 24
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX9-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX9-NEXT: v_mov_b32_e32 v9, 0xffffff
+; GFX9-NEXT: v_mul_lo_u32 v7, s4, v6
+; GFX9-NEXT: v_and_b32_e32 v5, v5, v9
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT: v_and_b32_e32 v3, v3, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX9-NEXT: v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v8
+; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s4, v7
+; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8
+; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6
+; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT: v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4
+; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT: v_and_b32_e32 v6, v6, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7
+; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2
+; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2
+; GFX9-NEXT: v_and_b32_e32 v2, v2, v9
+; GFX9-NEXT: v_and_b32_e32 v4, v4, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; GFX9-NEXT: v_lshl_or_b32 v1, v1, v4, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i24:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24
+; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24
+; GFX10-NEXT: s_sub_i32 s4, 0, 24
+; GFX10-NEXT: v_mov_b32_e32 v12, 0xffffff
+; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT: v_and_b32_e32 v2, v2, v12
+; GFX10-NEXT: v_and_b32_e32 v3, v3, v12
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GFX10-NEXT: v_mul_lo_u32 v8, s4, v6
+; GFX10-NEXT: v_mul_lo_u32 v9, s4, v7
+; GFX10-NEXT: v_mul_hi_u32 v8, v6, v8
+; GFX10-NEXT: v_mul_hi_u32 v9, v7, v9
+; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v9
+; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6
+; GFX10-NEXT: v_mul_hi_u32 v7, v5, v7
+; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24
+; GFX10-NEXT: v_mul_lo_u32 v7, v7, 24
+; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6
+; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v7
+; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX10-NEXT: v_and_b32_e32 v4, v11, v12
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2
+; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5
+; GFX10-NEXT: v_and_b32_e32 v5, v5, v12
+; GFX10-NEXT: v_and_b32_e32 v11, v6, v12
+; GFX10-NEXT: v_and_b32_e32 v4, v7, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3
+; GFX10-NEXT: v_lshl_or_b32 v0, v0, v11, v2
+; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
+ ret <2 x i24> %result
+}
+
+define amdgpu_ps i32 @s_fshr_i32(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: s_fshr_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ ret i32 %result
+}
+
+define amdgpu_ps i32 @s_fshr_i32_5(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i32_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 5
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 5
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32_5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 5
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32_5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 5
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
+ ret i32 %result
+}
+
+define amdgpu_ps i32 @s_fshr_i32_8(i32 inreg %lhs, i32 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i32_8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, 8
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i32_8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 8
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i32_8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 8
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i32_8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 8
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
+ ret i32 %result
+}
+
+define i32 @v_fshr_i32(i32 %lhs, i32 %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshr_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ ret i32 %result
+}
+
+define i32 @v_fshr_i32_5(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshr_i32_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 5
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32_5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, 5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32_5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, 5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 5)
+ ret i32 %result
+}
+
+define i32 @v_fshr_i32_8(i32 %lhs, i32 %rhs) {
+; GFX6-LABEL: v_fshr_i32_8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v1, 8
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i32_8:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i32_8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v1, 8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i32_8:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v1, 8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 8)
+ ret i32 %result
+}
+
+define amdgpu_ps float @v_fshr_i32_ssv(i32 inreg %lhs, i32 inreg %rhs, i32 %amt) {
+; GFX6-LABEL: v_fshr_i32_ssv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_ssv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_ssv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v1, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_ssv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ %cast.result = bitcast i32 %result to float
+ ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshr_i32_svs(i32 inreg %lhs, i32 %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshr_i32_svs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v1, s1
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_svs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_svs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_svs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ %cast.result = bitcast i32 %result to float
+ ret float %cast.result
+}
+
+define amdgpu_ps float @v_fshr_i32_vss(i32 inreg %lhs, i32 inreg %rhs, i32 inreg %amt) {
+; GFX6-LABEL: v_fshr_i32_vss:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mov_b32_e32 v1, s2
+; GFX6-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i32_vss:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i32_vss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i32_vss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i32 @llvm.fshr.i32(i32 %lhs, i32 %rhs, i32 %amt)
+ %cast.result = bitcast i32 %result to float
+ ret float %cast.result
+}
+
+define <2 x i32> @v_fshr_v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v2, v4
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %lhs, <2 x i32> %rhs, <2 x i32> %amt)
+ ret <2 x i32> %result
+}
+
+define <3 x i32> @v_fshr_v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v3i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v3, v6
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v3i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, v6
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v3i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v3, v6
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; GFX9-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v3i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v3, v6
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v4, v7
+; GFX10-NEXT: v_alignbit_b32 v2, v2, v5, v8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %lhs, <3 x i32> %rhs, <3 x i32> %amt)
+ ret <3 x i32> %result
+}
+
+define <4 x i32> @v_fshr_v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt) {
+; GFX6-LABEL: v_fshr_v4i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v4, v8
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v4i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v4, v8
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_alignbit_b32 v0, v0, v4, v8
+; GFX9-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX9-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX9-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v4i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_alignbit_b32 v0, v0, v4, v8
+; GFX10-NEXT: v_alignbit_b32 v1, v1, v5, v9
+; GFX10-NEXT: v_alignbit_b32 v2, v2, v6, v10
+; GFX10-NEXT: v_alignbit_b32 v3, v3, v7, v11
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> %amt)
+ ret <4 x i32> %result
+}
+
+define amdgpu_ps i16 @s_fshr_i16(i16 inreg %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: s_fshr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s3, s2, 15
+; GFX6-NEXT: s_andn2_b32 s2, 15, s2
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s2
+; GFX6-NEXT: s_bfe_u32 s2, s3, 0x100000
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s3, s2, 15
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s4
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s2, s3, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s3, s2, 15
+; GFX9-NEXT: s_andn2_b32 s2, 15, s2
+; GFX9-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s4
+; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT: s_bfe_u32 s2, s3, 0x100000
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s3, s2, 15
+; GFX10-NEXT: s_andn2_b32 s2, 15, s2
+; GFX10-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s4
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+ ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fshr_i16_4(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i16_4:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s0, 12
+; GFX6-NEXT: s_lshr_b32 s1, s1, 4
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16_4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bfe_u32 s2, 12, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s2, 4, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_bfe_u32 s2, 12, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT: s_bfe_u32 s2, 4, 0x100000
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16_4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_bfe_u32 s2, 12, 0x100000
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: s_bfe_u32 s3, 4, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
+ ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fshr_i16_5(i16 inreg %lhs, i16 inreg %rhs) {
+; GFX6-LABEL: s_fshr_i16_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT: s_lshl_b32 s0, s0, 11
+; GFX6-NEXT: s_lshr_b32 s1, s1, 5
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i16_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bfe_u32 s2, 11, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s2, 5, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i16_5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_bfe_u32 s2, 11, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT: s_bfe_u32 s2, 5, 0x100000
+; GFX9-NEXT: s_lshr_b32 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i16_5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_bfe_u32 s2, 11, 0x100000
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: s_bfe_u32 s3, 5, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
+ ret i16 %result
+}
+
+define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshr_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_bfe_u32 v2, v3, 0, 16
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, v3, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 15, v2
+; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, v3, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+ ret i16 %result
+}
+
+define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshr_i16_4:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16_4:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 12, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 4, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 12, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, 4, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16_4:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 12, v0
+; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
+ ret i16 %result
+}
+
+define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) {
+; GFX6-LABEL: v_fshr_i16_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 11, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 5, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i16_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 11, v0
+; GFX8-NEXT: v_lshrrev_b16_e32 v1, 5, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i16_5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 11, v0
+; GFX9-NEXT: v_lshrrev_b16_e32 v1, 5, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i16_5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 11, v0
+; GFX10-NEXT: v_lshrrev_b16_e64 v1, 5, v1
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
+ ret i16 %result
+}
+
+define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt) {
+; GFX6-LABEL: v_fshr_i16_ssv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_and_b32_e32 v1, 15, v0
+; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
+; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_ssv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v1, 15, v0
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT: s_bfe_u32 s2, 1, 0x100000
+; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT: s_lshl_b32 s0, s0, s2
+; GFX8-NEXT: v_lshlrev_b16_e64 v0, v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_ssv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v1, 15, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT: s_bfe_u32 s2, 1, 0x100000
+; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: v_lshlrev_b16_e64 v0, v0, s0
+; GFX9-NEXT: v_lshrrev_b16_e64 v1, v1, s1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_ssv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 15, v0
+; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
+; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s1
+; GFX10-NEXT: v_lshlrev_b16_e64 v1, v1, s0
+; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+ %cast.result = bitcast i16 %result to half
+ ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshr_i16_svs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s2, s1, 15
+; GFX6-NEXT: s_andn2_b32 s1, 15, s1
+; GFX6-NEXT: s_lshl_b32 s0, s0, 1
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_lshl_b32 s0, s0, s1
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_svs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s2, s1, 15
+; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s3
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_lshl_b32 s0, s0, s1
+; GFX8-NEXT: v_lshrrev_b16_e32 v0, s2, v0
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_svs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s2, s1, 15
+; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: s_bfe_u32 s3, 1, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s3
+; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX9-NEXT: s_lshl_b32 s0, s0, s1
+; GFX9-NEXT: v_lshrrev_b16_e32 v0, s2, v0
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_svs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s2, s1, 15
+; GFX10-NEXT: s_andn2_b32 s1, 15, s1
+; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000
+; GFX10-NEXT: v_lshrrev_b16_e64 v0, s2, v0
+; GFX10-NEXT: s_lshl_b32 s0, s0, s3
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: s_lshl_b32 s0, s0, s1
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+ %cast.result = bitcast i16 %result to half
+ ret half %cast.result
+}
+
+define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt) {
+; GFX6-LABEL: v_fshr_i16_vss:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b32 s2, s1, 15
+; GFX6-NEXT: s_andn2_b32 s1, 15, s1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s1, v0
+; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX6-NEXT: s_lshr_b32 s0, s0, s1
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i16_vss:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s2, s1, 15
+; GFX8-NEXT: s_andn2_b32 s1, 15, s1
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX8-NEXT: v_lshlrev_b16_e32 v0, s1, v0
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX8-NEXT: s_lshr_b32 s0, s0, s1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i16_vss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s2, s1, 15
+; GFX9-NEXT: s_andn2_b32 s1, 15, s1
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, s1, v0
+; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX9-NEXT: s_bfe_u32 s1, s2, 0x100000
+; GFX9-NEXT: s_lshr_b32 s0, s0, s1
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i16_vss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
+; GFX10-NEXT: s_andn2_b32 s2, 15, s1
+; GFX10-NEXT: s_and_b32 s1, s1, 15
+; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX10-NEXT: v_lshlrev_b16_e64 v0, s2, v0
+; GFX10-NEXT: s_lshr_b32 s0, s0, s1
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
+ %cast.result = bitcast i16 %result to half
+ ret half %cast.result
+}
+
+define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) {
+; GFX6-LABEL: s_fshr_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_mov_b32 s5, 0xffff
+; GFX6-NEXT: s_lshr_b32 s3, s0, 16
+; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT: s_and_b32 s6, s1, s5
+; GFX6-NEXT: s_lshl_b32 s0, s0, s4
+; GFX6-NEXT: s_lshl_b32 s3, s3, s4
+; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000
+; GFX6-NEXT: s_lshr_b32 s4, s1, 17
+; GFX6-NEXT: s_lshr_b32 s6, s6, 1
+; GFX6-NEXT: s_lshr_b32 s4, s4, s7
+; GFX6-NEXT: s_lshr_b32 s6, s6, s7
+; GFX6-NEXT: s_or_b32 s3, s3, s4
+; GFX6-NEXT: s_lshr_b32 s4, s1, 16
+; GFX6-NEXT: s_lshl_b32 s1, s1, 1
+; GFX6-NEXT: s_xor_b32 s2, s2, -1
+; GFX6-NEXT: s_and_b32 s7, s2, 15
+; GFX6-NEXT: s_and_b32 s1, s1, s5
+; GFX6-NEXT: s_or_b32 s0, s0, s6
+; GFX6-NEXT: s_lshr_b32 s6, s2, 16
+; GFX6-NEXT: s_andn2_b32 s2, 15, s2
+; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX6-NEXT: s_lshr_b32 s1, s1, 1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshr_b32 s1, s1, s2
+; GFX6-NEXT: s_lshl_b32 s0, s0, s7
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: s_and_b32 s1, s6, 15
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_lshl_b32 s4, s4, 1
+; GFX6-NEXT: s_andn2_b32 s2, 15, s6
+; GFX6-NEXT: s_lshl_b32 s1, s3, s1
+; GFX6-NEXT: s_and_b32 s3, s4, s5
+; GFX6-NEXT: s_lshr_b32 s3, s3, 1
+; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX6-NEXT: s_lshr_b32 s2, s3, s2
+; GFX6-NEXT: s_or_b32 s1, s1, s2
+; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX6-NEXT: s_lshl_b32 s1, s1, 16
+; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bfe_u32 s5, 1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s6, s1, 0x100000
+; GFX8-NEXT: s_lshr_b32 s6, s6, s5
+; GFX8-NEXT: s_bfe_u32 s7, 14, 0x100000
+; GFX8-NEXT: s_lshr_b32 s3, s0, 16
+; GFX8-NEXT: s_lshr_b32 s4, s1, 16
+; GFX8-NEXT: s_lshl_b32 s0, s0, s5
+; GFX8-NEXT: s_lshr_b32 s6, s6, s7
+; GFX8-NEXT: s_or_b32 s0, s0, s6
+; GFX8-NEXT: s_lshr_b32 s6, s4, s5
+; GFX8-NEXT: s_lshl_b32 s1, s1, s5
+; GFX8-NEXT: s_lshl_b32 s3, s3, s5
+; GFX8-NEXT: s_lshr_b32 s6, s6, s7
+; GFX8-NEXT: s_xor_b32 s2, s2, -1
+; GFX8-NEXT: s_and_b32 s7, s2, 15
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_or_b32 s3, s3, s6
+; GFX8-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NEXT: s_andn2_b32 s2, 15, s2
+; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, s5
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_lshr_b32 s1, s1, s2
+; GFX8-NEXT: s_lshl_b32 s0, s0, s7
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: s_and_b32 s1, s6, 15
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_lshl_b32 s4, s4, s5
+; GFX8-NEXT: s_andn2_b32 s2, 15, s6
+; GFX8-NEXT: s_lshl_b32 s1, s3, s1
+; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000
+; GFX8-NEXT: s_lshr_b32 s3, s3, s5
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000
+; GFX8-NEXT: s_lshr_b32 s2, s3, s2
+; GFX8-NEXT: s_or_b32 s1, s1, s2
+; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000
+; GFX8-NEXT: s_lshl_b32 s1, s1, 16
+; GFX8-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s3, 0xf000f
+; GFX9-NEXT: s_and_b32 s4, s2, s3
+; GFX9-NEXT: s_andn2_b32 s2, s3, s2
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001
+; GFX9-NEXT: s_lshl_b32 s3, s3, 1
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: s_lshr_b32 s5, s2, 16
+; GFX9-NEXT: s_lshl_b32 s0, s0, s2
+; GFX9-NEXT: s_lshl_b32 s2, s3, s5
+; GFX9-NEXT: s_mov_b32 s3, 0xffff
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT: s_lshr_b32 s2, s1, 16
+; GFX9-NEXT: s_and_b32 s1, s1, s3
+; GFX9-NEXT: s_lshr_b32 s5, s4, 16
+; GFX9-NEXT: s_and_b32 s3, s4, s3
+; GFX9-NEXT: s_lshr_b32 s1, s1, s3
+; GFX9-NEXT: s_lshr_b32 s2, s2, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s4, s0, 16
+; GFX10-NEXT: s_mov_b32 s3, 0xf000f
+; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001
+; GFX10-NEXT: s_lshl_b32 s4, s4, 1
+; GFX10-NEXT: s_and_b32 s5, s2, s3
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
+; GFX10-NEXT: s_andn2_b32 s2, s3, s2
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: s_lshr_b32 s4, s2, 16
+; GFX10-NEXT: s_lshl_b32 s0, s0, s2
+; GFX10-NEXT: s_lshl_b32 s2, s3, s4
+; GFX10-NEXT: s_mov_b32 s3, 0xffff
+; GFX10-NEXT: s_lshr_b32 s4, s1, 16
+; GFX10-NEXT: s_and_b32 s1, s1, s3
+; GFX10-NEXT: s_and_b32 s3, s5, s3
+; GFX10-NEXT: s_lshr_b32 s5, s5, 16
+; GFX10-NEXT: s_lshr_b32 s1, s1, s3
+; GFX10-NEXT: s_lshr_b32 s3, s4, s5
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT: s_or_b32 s0, s0, s1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt)
+ %cast = bitcast <2 x i16> %result to i32
+ ret i32 %cast
+}
+
+define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) {
+; GFX6-LABEL: v_fshr_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s5, 0xffff
+; GFX6-NEXT: v_and_b32_e32 v4, s5, v1
+; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 17, v1
+; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 15, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT: v_and_b32_e32 v1, s5, v1
+; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1
+; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 15, v5
+; GFX6-NEXT: v_xor_b32_e32 v2, -1, v5
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4
+; GFX6-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v3, s5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3
+; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:
v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v2i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, 14, v4 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v5, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 14, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, v4, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 +; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + ret <2 x i16> %result +} + +define <2 x i16> @v_fshr_v2i16_4_8(<2 x i16> %lhs, <2 x i16> %rhs) { +; GFX6-LABEL: v_fshr_v2i16_4_8: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_sub_i32 s4, 0, 4 +; GFX6-NEXT: s_and_b32 s6, s4, 15 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX6-NEXT: s_xor_b32 s4, s4, -1 +; GFX6-NEXT: s_sub_i32 s5, 0, 8 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: s_and_b32 s4, s5, 15 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_xor_b32 s5, s5, -1 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; 
GFX6-NEXT: v_lshrrev_b32_e32 v1, 17, v1 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v2i16_4_8: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s4, 0, 4 +; GFX8-NEXT: s_and_b32 s6, s4, 15 +; GFX8-NEXT: s_sub_i32 s5, 0, 8 +; GFX8-NEXT: s_xor_b32 s4, s4, -1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, s4, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, s6, v0 +; GFX8-NEXT: s_and_b32 s4, s5, 15 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: s_xor_b32 s5, s5, -1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, s5, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i16_4_8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: s_sub_i32 s4, 0, 16 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_hi_u32 v2, 4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, 4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, 8, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, 16, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX9-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v2i16_4_8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, 16 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 16 +; GFX10-NEXT: s_sub_i32 s4, 0, 16 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, s4, v2 +; GFX10-NEXT: v_mul_lo_u32 v5, s4, v3 +; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4 +; GFX10-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v5 +; GFX10-NEXT: v_mul_hi_u32 v2, 8, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, 4, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 8, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 4, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 16, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v3, 0xffff, v2 +; GFX10-NEXT: v_pk_sub_i16 v3, 16, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> <i16 4, i16 8>) + ret <2 x i16> %result +} + +define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> %amt) { +; GFX6-LABEL: v_fshr_v2i16_ssv: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_and_b32 s5, s1, s4 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: s_lshl_b32 s2, s2, s3 +; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s1, 17 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshr_b32 s3, s1, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s5 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: s_lshl_b32 s1, s1, 1 +; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_and_b32 s0, s1, s4 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX6-NEXT: s_and_b32 s0, s3, s4 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_lshl_b32_e32 v2, s2, v2 +; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; 
GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshr_v2i16_ssv: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s5, s5, s4 +; GFX8-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s5, s5, s6 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX8-NEXT: s_lshl_b32 s1, s1, s4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s0 +; GFX8-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_lshr_b32 s5, s3, s4 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, v0, s0 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_bfe_u32 s0, s3, 0x100000 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_lshr_b32 s5, s5, s6 +; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v2, v2, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshr_v2i16_ssv: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v1, s2, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshr_v2i16_ssv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX10-NEXT: s_lshl_b32 s2, s3, 1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s1 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { +; GFX6-LABEL: v_fshr_v2i16_svs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: v_and_b32_e32 v1, s4, v0 +; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 17, v0 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, s5, v1 +; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_lshl_b32 s0, s2, 
s3 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s5, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_xor_b32 s0, s1, -1 +; GFX6-NEXT: s_and_b32 s2, s0, 15 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX6-NEXT: s_lshr_b32 s1, s0, 16 +; GFX6-NEXT: s_andn2_b32 s0, 15, s0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v2 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshr_v2i16_svs: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v1 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_lshl_b32 s0, s2, s3 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 14, v3 +; GFX8-NEXT: v_or_b32_e32 v3, s0, v3 +; GFX8-NEXT: s_xor_b32 s0, s1, -1 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_and_b32 s2, s0, 15 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_andn2_b32 s0, 15, s0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, s0, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s2, v1 +; GFX8-NEXT: s_and_b32 s0, s1, 15 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v2, s0, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, s1, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshr_v2i16_svs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s2, s2, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s1, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s3, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshr_v2i16_svs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; 
GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s4, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX10-NEXT: s_lshl_b32 s0, s0, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { +; GFX6-LABEL: v_fshr_v2i16_vss: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s3, 0xffff +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: s_bfe_u32 s2, 1, 0x100000 +; GFX6-NEXT: s_and_b32 s4, s0, s3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s2, v1 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 +; GFX6-NEXT: s_lshr_b32 s2, s0, 17 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_lshr_b32 s2, s2, s5 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 +; GFX6-NEXT: s_lshr_b32 s2, s0, 16 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s5, s1, 15 +; GFX6-NEXT: s_and_b32 s0, s0, s3 +; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX6-NEXT: s_lshr_b32 s4, s1, 16 +; GFX6-NEXT: s_andn2_b32 s1, 15, s1 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX6-NEXT: s_and_b32 s0, s4, 15 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_andn2_b32 s1, 15, s4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 +; GFX6-NEXT: s_and_b32 s0, s2, s3 +; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s0, s0, s1 +; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 +; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fshr_v2i16_vss: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x100000 +; GFX8-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_bfe_u32 s5, 14, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v0 +; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX8-NEXT: s_lshr_b32 s3, s2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, s4 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: s_xor_b32 s1, s1, -1 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX8-NEXT: s_and_b32 s5, s1, 15 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_andn2_b32 s1, 15, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, s5, v1 +; GFX8-NEXT: v_or_b32_e32 v1, s0, v1 +; GFX8-NEXT: s_and_b32 s0, s3, 15 +; GFX8-NEXT: s_lshl_b32 s2, s2, s4 +; GFX8-NEXT: s_andn2_b32 s1, 15, s3 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, s0, v0 +; GFX8-NEXT: s_bfe_u32 
s0, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s4 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshr_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fshr_v2i16_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s2, 0xf000f +; GFX9-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s1, v0 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_and_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: v_fshr_v2i16_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: s_mov_b32 s2, 0xf000f +; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: s_and_b32 s4, s1, s2 +; GFX10-NEXT: s_andn2_b32 s1, s2, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 +; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s1, s4, s3 +; GFX10-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10-NEXT: s_lshr_b32 s0, s0, s1 +; GFX10-NEXT: s_lshr_b32 s1, s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) + %cast = bitcast <2 x i16> %result to float + ret float %cast +} + +; ; FIXME +; define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) { +; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) +; %cast = bitcast <3 x i16> %result to i48 +; ret i48 %cast +; } + +; ; FIXME +; define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) { +; %result = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt) +; %cast.result = bitcast <3 x i16> %result to <3 x half> +; ret <3 x half> %cast.result +; } + +define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { +; GFX6-LABEL: s_fshr_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_mov_b32 s12, 0xffff +; GFX6-NEXT: s_lshl_b32 s9, s9, 16 +; GFX6-NEXT: s_and_b32 s8, s8, s12 +; GFX6-NEXT: s_or_b32 s8, s9, s8 +; GFX6-NEXT: s_lshl_b32 s9, s11, 16 +; GFX6-NEXT: s_and_b32 s11, s4, s12 +; GFX6-NEXT: s_and_b32 s10, s10, s12 +; GFX6-NEXT: s_or_b32 s9, s9, s10 +; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 +; GFX6-NEXT: s_lshr_b32 s11, s11, 1 +; GFX6-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s10 +; GFX6-NEXT: s_lshr_b32 s11, s11, s13 +; GFX6-NEXT: s_or_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s11, s5, s12 +; GFX6-NEXT: s_lshr_b32 s11, s11, 1 +; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_xor_b32 s8, s8, -1 +; GFX6-NEXT: s_lshl_b32 s1, s1, s10 +; GFX6-NEXT: s_lshr_b32 s11, s11, s13 +; GFX6-NEXT: s_and_b32 s14, s8, 15 +; GFX6-NEXT: s_and_b32 s4, s4, s12 +; GFX6-NEXT: s_or_b32 s1, s1, s11 +; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_andn2_b32 s8, 15, s8 
+; GFX6-NEXT: s_bfe_u32 s14, s14, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s8 +; GFX6-NEXT: s_lshl_b32 s0, s0, s14 +; GFX6-NEXT: s_or_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s4, s11, 15 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s5, s5, 1 +; GFX6-NEXT: s_lshl_b32 s1, s1, s4 +; GFX6-NEXT: s_and_b32 s4, s5, s12 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 +; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_or_b32 s1, s1, s4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s2, s10 +; GFX6-NEXT: s_and_b32 s2, s6, s12 +; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_lshr_b32 s2, s2, s13 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s3, s10 +; GFX6-NEXT: s_and_b32 s3, s7, s12 +; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_lshr_b32 s3, s3, s13 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s6, 1 +; GFX6-NEXT: s_xor_b32 s5, s9, -1 +; GFX6-NEXT: s_and_b32 s3, s3, s12 +; GFX6-NEXT: s_lshl_b32 s4, s7, 1 +; GFX6-NEXT: s_and_b32 s7, s5, 15 +; GFX6-NEXT: s_lshr_b32 s6, s5, 16 +; GFX6-NEXT: s_andn2_b32 s5, 15, s5 +; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 +; GFX6-NEXT: s_lshl_b32 s1, s1, s7 +; GFX6-NEXT: s_or_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 15 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, s3 +; GFX6-NEXT: s_and_b32 s3, s4, s12 +; GFX6-NEXT: s_andn2_b32 s5, 15, s6 +; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s8, 1, 0x100000 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x100000 +; GFX8-NEXT: s_lshr_b32 s9, s9, s8 +; GFX8-NEXT: s_bfe_u32 s10, 14, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_lshl_b32 s0, s0, s8 +; GFX8-NEXT: s_lshr_b32 s9, s9, s10 +; GFX8-NEXT: s_or_b32 s0, s0, s9 +; GFX8-NEXT: s_lshr_b32 s9, s7, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_xor_b32 s4, s4, -1 +; GFX8-NEXT: s_lshl_b32 s6, s6, s8 +; GFX8-NEXT: s_lshr_b32 s9, s9, s10 +; GFX8-NEXT: s_and_b32 s11, s4, 15 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_or_b32 s6, s6, s9 +; GFX8-NEXT: s_lshr_b32 s9, s4, 16 +; GFX8-NEXT: s_andn2_b32 s4, 15, s4 +; GFX8-NEXT: s_bfe_u32 s11, s11, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s2, s2, s4 +; GFX8-NEXT: s_lshl_b32 s0, s0, s11 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 15 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_lshl_b32 s7, s7, s8 +; GFX8-NEXT: s_andn2_b32 s4, 15, s9 +; GFX8-NEXT: s_lshl_b32 s2, s6, s2 +; GFX8-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX8-NEXT: s_lshr_b32 s6, s6, s8 +; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX8-NEXT: s_lshr_b32 s4, s6, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x100000 +; GFX8-NEXT: 
s_lshr_b32 s6, s6, s8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s2, s1, 16 +; GFX8-NEXT: s_lshr_b32 s4, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s1, s8 +; GFX8-NEXT: s_lshr_b32 s6, s6, s10 +; GFX8-NEXT: s_or_b32 s1, s1, s6 +; GFX8-NEXT: s_lshr_b32 s6, s4, s8 +; GFX8-NEXT: s_lshl_b32 s3, s3, s8 +; GFX8-NEXT: s_xor_b32 s5, s5, -1 +; GFX8-NEXT: s_lshl_b32 s2, s2, s8 +; GFX8-NEXT: s_lshr_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s7, s5, 15 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_lshr_b32 s6, s5, 16 +; GFX8-NEXT: s_andn2_b32 s5, 15, s5 +; GFX8-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 +; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, s7 +; GFX8-NEXT: s_or_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s3, s6, 15 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX8-NEXT: s_lshl_b32 s4, s4, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_bfe_u32 s3, s4, 0x100000 +; GFX8-NEXT: s_andn2_b32 s5, 15, s6 +; GFX8-NEXT: s_lshr_b32 s3, s3, s8 +; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s9, s0, 16 +; GFX9-NEXT: s_mov_b32 s8, 0x10001 +; GFX9-NEXT: s_mov_b32 s6, 0xf000f +; GFX9-NEXT: s_lshl_b32 s0, s0, s8 +; GFX9-NEXT: s_lshl_b32 s9, s9, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s7, s4, s6 +; GFX9-NEXT: s_andn2_b32 s4, s6, s4 +; GFX9-NEXT: s_lshr_b32 s9, s0, 16 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s4 +; GFX9-NEXT: s_lshl_b32 s4, s9, s10 +; GFX9-NEXT: s_mov_b32 s9, 0xffff +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_lshr_b32 s10, s7, 16 +; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s7, s7, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, s7 +; GFX9-NEXT: s_lshr_b32 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s6 +; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s8 +; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, s4 +; GFX9-NEXT: s_lshl_b32 s4, s5, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 +; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s3, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_mov_b32 s7, 0x10001 +; GFX10-NEXT: s_mov_b32 s6, 0xf000f +; GFX10-NEXT: s_lshl_b32 s0, s0, s7 +; GFX10-NEXT: s_lshl_b32 s8, s8, 1 +; GFX10-NEXT: s_and_b32 s9, s4, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_andn2_b32 s4, s6, s4 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; 
GFX10-NEXT: s_lshl_b32 s4, s8, s10 +; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s7 +; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s7, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_andn2_b32 s4, s6, s5 +; GFX10-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_and_b32 s11, s9, s8 +; GFX10-NEXT: s_and_b32 s2, s2, s8 +; GFX10-NEXT: s_lshr_b32 s9, s9, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, s4 +; GFX10-NEXT: s_lshl_b32 s4, s5, s6 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_and_b32 s6, s7, s8 +; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_lshr_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, s11 +; GFX10-NEXT: s_lshr_b32 s9, s10, s9 +; GFX10-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s5, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: ; return to shader part epilog + %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) + %cast.result = bitcast <4 x i16> %result to <2 x i32> + ret <2 x i32> %cast.result +} + +define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) { +; GFX6-LABEL: v_fshr_v4i16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX6-NEXT: v_and_b32_e32 v8, v8, v12 +; GFX6-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX6-NEXT: v_and_b32_e32 v10, v10, v12 +; GFX6-NEXT: s_mov_b32 s5, 0xffff +; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX6-NEXT: v_and_b32_e32 v10, s5, v4 +; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 1, v10 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s6, v10 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX6-NEXT: v_and_b32_e32 v10, s5, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 1, v10 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s6, v10 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v4, s5, v5 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX6-NEXT: v_and_b32_e32 v4, s5, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; 
GFX6-NEXT: v_and_b32_e32 v4, s5, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 +; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v7 +; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_and_b32_e32 v4, s5, v4 +; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 +; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v4, s5, v5 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 14, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_mov_b32_e32 v7, 1 +; GFX8-NEXT: v_lshrrev_b16_sdwa v8, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 14, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v4, v4, v8 +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v10, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v8, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_e32 v4, 1, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 14, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v8, 
15, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v6 +; GFX8-NEXT: v_lshlrev_b16_e32 v4, v8, v4 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v7 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v5, v1 +; GFX8-NEXT: v_lshrrev_b16_e32 v3, v6, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v6, s4, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v5 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v2, s4, v5 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v4, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, v2, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 +; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) + %cast.result = bitcast <4 x i16> %result to <4 x half> + ret <4 x half> %cast.result +} + +define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { +; GFX6-LABEL: s_fshr_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63 +; GFX6-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63 +; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63 +; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] +; GFX9-NEXT: 
s_lshl_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshr_i64_5(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshr_i64_5: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s1, s0, 27 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 5 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshr_i64_32(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshr_i64_32: +; GCN: ; %bb.0: +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: s_mov_b32 s0, 0 +; GCN-NEXT: s_mov_b32 s2, s3 +; GCN-NEXT: s_mov_b32 s3, s0 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32) + ret i64 %result +} + +define amdgpu_ps i64 @s_fshr_i64_48(i64 inreg %lhs, i64 inreg %rhs) { +; GCN-LABEL: s_fshr_i64_48: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshr_b32 s2, s3, 16 +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GCN-NEXT: s_mov_b32 s3, 0 +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: ; return to shader part epilog + %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48) + ret i64 %result +} + +define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { +; GFX6-LABEL: v_fshr_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX8-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v4 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, 
v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v4, 63, v4
+; GFX10-NEXT: v_and_b32_e32 v7, 63, v5
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+ ret i64 %result
+}
+
+define i64 @v_fshr_i64_5(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[2:3], 5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 27, v4
+; GFX6-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 27, v4
+; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_5:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 27, v4
+; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_5:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], 5, v[2:3]
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 27, v4
+; GFX10-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 5)
+ ret i64 %result
+}
+
+define i64 @v_fshr_i64_32(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 32)
+ ret i64 %result
+}
+
+define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) {
+; GFX6-LABEL: v_fshr_i64_48:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i64_48:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i64_48:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i64_48:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
+; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 48)
+ ret i64 %result
+}
+
+define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) {
+; GFX6-LABEL: v_fshr_i64_ssv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_and_b32_e32 v2, 63, v0
+; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0
+; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_ssv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_and_b32_e32 v2, 63, v0
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_ssv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_and_b32_e32 v2, 63, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_ssv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 63, v0
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT: v_and_b32_e32 v2, 63, v1
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[2:3]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, s[0:1]
+; GFX10-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX10-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+ %cast = bitcast i64 %result to <2 x float>
+ ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshr_i64_svs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_svs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_svs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_svs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+ %cast = bitcast i64 %result to <2 x float>
+ ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) {
+; GFX6-LABEL: v_fshr_i64_vss:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i64_vss:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1]
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i64_vss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63
+; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i64_vss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s1, v1
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt)
+ %cast = bitcast i64 %result to <2 x float>
+ ret <2 x float> %cast
+}
+
+define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) {
+; GFX6-LABEL: s_fshr_v2i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63
+; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63
+; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_v2i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63
+; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63
+; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63
+; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
+; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63
+; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4
+; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_v2i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9]
+; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8
+; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1
+; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12
+; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+ ret <2 x i64> %result
+}
+
+define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) {
+; GFX6-LABEL: v_fshr_v2i64:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9
+; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 63, v10
+; GFX6-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8
+; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4
+; GFX6-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_v2i64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX8-NEXT: v_xor_b32_e32 v8, -1, v10
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 63, v10
+; GFX8-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v9, 63, v8
+; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5]
+; GFX9-NEXT: v_xor_b32_e32 v8, -1, v10
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 63, v10
+; GFX9-NEXT: v_and_b32_e32 v8, 63, v8
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7]
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v6
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v7
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_v2i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8
+; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX10-NEXT: v_and_b32_e32 v19, 63, v8
+; GFX10-NEXT: v_and_b32_e32 v15, 63, v9
+; GFX10-NEXT: v_and_b32_e32 v9, 63, v11
+; GFX10-NEXT: v_and_b32_e32 v13, 63, v10
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], v19, v[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[11:12], v15, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[15:16], v9, v[2:3]
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, v[6:7]
+; GFX10-NEXT: v_or_b32_e32 v0, v11, v4
+; GFX10-NEXT: v_or_b32_e32 v1, v12, v5
+; GFX10-NEXT: v_or_b32_e32 v2, v15, v6
+; GFX10-NEXT: v_or_b32_e32 v3, v16, v7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt)
+ ret <2 x i64> %result
+}
+
+define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: s_fshr_i128:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_movk_i32 s10, 0x7f
+; GFX6-NEXT: s_mov_b32 s11, 0
+; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX6-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX6-NEXT: s_sub_i32 s9, 1, 64
+; GFX6-NEXT: s_sub_i32 s13, 64, 1
+; GFX6-NEXT: s_cmp_lt_u32 1, 64
+; GFX6-NEXT: s_cselect_b32 s18, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 1, 0
+; GFX6-NEXT: s_cselect_b32 s19, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s13
+; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], 1
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_sub_i32 s13, s8, 64
+; GFX6-NEXT: s_sub_i32 s9, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
+; GFX6-NEXT: s_cselect_b32 s16, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_cselect_b32 s17, 1, 0
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s8
+; GFX6-NEXT: s_lshr_b64 s[14:15], s[10:11], s9
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
+; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s13
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX6-NEXT: s_sub_i32 s14, s12, 64
+; GFX6-NEXT: s_sub_i32 s13, 64, s12
+; GFX6-NEXT: s_cmp_lt_u32 s12, 64
+; GFX6-NEXT: s_cselect_b32 s15, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s12, 0
+; GFX6-NEXT: s_cselect_b32 s16, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s12
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s12
+; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s13
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
+; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fshr_i128:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_movk_i32 s10, 0x7f
+; GFX8-NEXT: s_mov_b32 s11, 0
+; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX8-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX8-NEXT: s_sub_i32 s9, 1, 64
+; GFX8-NEXT: s_sub_i32 s13, 64, 1
+; GFX8-NEXT: s_cmp_lt_u32 1, 64
+; GFX8-NEXT: s_cselect_b32 s18, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 1, 0
+; GFX8-NEXT: s_cselect_b32 s19, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s13
+; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], 1
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX8-NEXT: s_cmp_lg_u32 s18, 0
+; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s19, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_sub_i32 s13, s8, 64
+; GFX8-NEXT: s_sub_i32 s9, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
+; GFX8-NEXT: s_cselect_b32 s16, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_cselect_b32 s17, 1, 0
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s8
+; GFX8-NEXT: s_lshr_b64 s[14:15], s[10:11], s9
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
+; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s13
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX8-NEXT: s_cmp_lg_u32 s17, 0
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX8-NEXT: s_sub_i32 s14, s12, 64
+; GFX8-NEXT: s_sub_i32 s13, 64, s12
+; GFX8-NEXT: s_cmp_lt_u32 s12, 64
+; GFX8-NEXT: s_cselect_b32 s15, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s12, 0
+; GFX8-NEXT: s_cselect_b32 s16, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s12
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s12
+; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s13
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fshr_i128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_movk_i32 s10, 0x7f
+; GFX9-NEXT: s_mov_b32 s11, 0
+; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX9-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX9-NEXT: s_sub_i32 s9, 1, 64
+; GFX9-NEXT: s_sub_i32 s13, 64, 1
+; GFX9-NEXT: s_cmp_lt_u32 1, 64
+; GFX9-NEXT: s_cselect_b32 s18, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 1, 0
+; GFX9-NEXT: s_cselect_b32 s19, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s13
+; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], 1
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX9-NEXT: s_cmp_lg_u32 s18, 0
+; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_sub_i32 s13, s8, 64
+; GFX9-NEXT: s_sub_i32 s9, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
+; GFX9-NEXT: s_cselect_b32 s16, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_cselect_b32 s17, 1, 0
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s8
+; GFX9-NEXT: s_lshr_b64 s[14:15], s[10:11], s9
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8
+; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s13
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11]
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9]
+; GFX9-NEXT: s_sub_i32 s14, s12, 64
+; GFX9-NEXT: s_sub_i32 s13, 64, s12
+; GFX9-NEXT: s_cmp_lt_u32 s12, 64
+; GFX9-NEXT: s_cselect_b32 s15, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s12, 0
+; GFX9-NEXT: s_cselect_b32 s16, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s12
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s12
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s13
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0
+; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5]
+; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fshr_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_movk_i32 s10, 0x7f
+; GFX10-NEXT: s_mov_b32 s11, 0
+; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], s[10:11]
+; GFX10-NEXT: s_andn2_b64 s[8:9], s[10:11], s[8:9]
+; GFX10-NEXT: s_sub_i32 s9, 1, 64
+; GFX10-NEXT: s_sub_i32 s10, 64, 1
+; GFX10-NEXT: s_cmp_lt_u32 1, 64
+; GFX10-NEXT: s_cselect_b32 s13, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 1, 0
+; GFX10-NEXT: s_cselect_b32 s18, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], 1
+; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], 1
+; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9
+; GFX10-NEXT: s_cmp_lg_u32 s13, 0
+; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s18, 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_sub_i32 s13, s8, 64
+; GFX10-NEXT: s_sub_i32 s2, 64, s8
+; GFX10-NEXT: s_cmp_lt_u32 s8, 64
+; GFX10-NEXT: s_cselect_b32 s16, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_cselect_b32 s17, 1, 0
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s8
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[14:15], s2
+; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[14:15], s13
+; GFX10-NEXT: s_cmp_lg_u32 s16, 0
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
+; GFX10-NEXT: s_cmp_lg_u32 s17, 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT: s_sub_i32 s14, s12, 64
+; GFX10-NEXT: s_sub_i32 s10, 64, s12
+; GFX10-NEXT: s_cmp_lt_u32 s12, 64
+; GFX10-NEXT: s_cselect_b32 s15, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s12, 0
+; GFX10-NEXT: s_cselect_b32 s16, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s12
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10
+; GFX10-NEXT: s_lshr_b64 s[12:13], s[6:7], s12
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11]
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
+; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_cmp_lg_u32 s16, 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], 0
+; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+ ret i128 %result
+}
+
+define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshr_i128:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_movk_i32 s4, 0x7f
+; GFX6-NEXT: v_and_b32_e32 v14, s4, v8
+; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX6-NEXT: v_and_b32_e32 v15, s4, v8
+; GFX6-NEXT: s_sub_i32 s5, 64, 1
+; GFX6-NEXT: s_sub_i32 s4, 1, 64
+; GFX6-NEXT: s_cmp_lt_u32 1, 64
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 1, 0
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], s5
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], 1
+; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], 1
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4
+; GFX6-NEXT: s_and_b32 s4, 1, s6
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX6-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX6-NEXT: s_and_b32 s4, 1, s7
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v15
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[8:9], v2
+; GFX6-NEXT: v_lshl_b64 v[10:11], v[0:1], v15
+; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15
+; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15
+; GFX6-NEXT: v_or_b32_e32 v10, v2, v10
+; GFX6-NEXT: v_or_b32_e32 v11, v3, v11
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[8:9], v16
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2
+; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14
+; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], v15
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], v14
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fshr_i128:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_movk_i32 s4, 0x7f
+; GFX8-NEXT: v_and_b32_e32 v14, s4, v8
+; GFX8-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX8-NEXT: v_and_b32_e32 v15, s4, v8
+; GFX8-NEXT: s_sub_i32 s5, 64, 1
+; GFX8-NEXT: s_sub_i32 s4, 1, 64
+; GFX8-NEXT: s_cmp_lt_u32 1, 64
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GFX8-NEXT: v_lshlrev_b64 v[12:13], 1, v[0:1]
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX8-NEXT: s_and_b32 s4, 1, s6
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX8-NEXT: s_and_b32 s4, 1, s7
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v15
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15
+; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX8-NEXT: v_or_b32_e32 v10, v2, v10
+; GFX8-NEXT: v_or_b32_e32 v11, v3, v11
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v16, v[8:9]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14
+; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fshr_i128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x7f
+; GFX9-NEXT: v_and_b32_e32 v14, s4, v8
+; GFX9-NEXT: v_xor_b32_e32 v8, -1, v8
+; GFX9-NEXT: v_and_b32_e32 v15, s4, v8
+; GFX9-NEXT: s_sub_i32 s5, 64, 1
+; GFX9-NEXT: s_sub_i32 s4, 1, 64
+; GFX9-NEXT: s_cmp_lt_u32 1, 64
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], 1, v[0:1]
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1]
+; GFX9-NEXT: s_and_b32 s4, 1, s6
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX9-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX9-NEXT: s_and_b32 s4, 1, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v15
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[0:1]
+; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15
+; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9]
+; GFX9-NEXT: v_or_b32_e32 v10, v2, v10
+; GFX9-NEXT: v_or_b32_e32 v11, v3, v11
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v16, v[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v0, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v1, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14
+; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v0, v12, v0
+; GFX9-NEXT: v_or_b32_e32 v1, v13, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fshr_i128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: s_sub_i32 s4, 64, 1
+; GFX10-NEXT: s_sub_i32 s6, 1, 64
+; GFX10-NEXT: s_cmp_lt_u32 1, 64
+; GFX10-NEXT: v_lshrrev_b64 v[9:10], s4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[11:12], 1, v[2:3]
+; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 1, 0
+; GFX10-NEXT: v_lshlrev_b64 v[13:14], 1, v[0:1]
+; GFX10-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1]
+; GFX10-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX10-NEXT: v_xor_b32_e32 v15, -1, v8
+; GFX10-NEXT: s_movk_i32 s5, 0x7f
+; GFX10-NEXT: s_and_b32 s6, 1, s7
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s6
+; GFX10-NEXT: v_and_b32_e32 v19, s5, v15
+; GFX10-NEXT: v_and_b32_e32 v20, s5, v8
+; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v13, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v19
+; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v20
+; GFX10-NEXT: v_mov_b32_e32 v25, v4
+; GFX10-NEXT: v_mov_b32_e32 v26, v5
+; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v11, v[9:10]
+; GFX10-NEXT: v_lshlrev_b64 v[11:12], v19, v[0:1]
+; GFX10-NEXT: v_lshlrev_b64 v[13:14], v19, v[9:10]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v20
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, v[9:10]
+; GFX10-NEXT: v_lshrrev_b64 v[15:16], v20, v[25:26]
+; GFX10-NEXT: v_lshlrev_b64 v[17:18], v17, v[6:7]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v19
+; GFX10-NEXT: v_or_b32_e32 v10, v3, v12
+; GFX10-NEXT: v_or_b32_e32 v11, v2, v11
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v21, v[6:7]
+; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v20
+; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v13, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v13, v15, v17
+; GFX10-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v10, v16, v18
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v20, v[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v19
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v8, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v13, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v10, s4
+; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v20
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v15, v1, s6
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v0, s6
+; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v14, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v25, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v26, s5
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v7, s4
+; GFX10-NEXT: v_or_b32_e32 v0, v23, v0
+; GFX10-NEXT: v_or_b32_e32 v1, v10, v1
+; GFX10-NEXT: v_or_b32_e32 v2, v8, v2
+; GFX10-NEXT: v_or_b32_e32 v3, v9, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+ %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+ ret i128 %result
+}
+
+define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) {
+; GFX6-LABEL: v_fshr_i128_ssv:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_sub_i32 s14, 1, 64
+; GFX6-NEXT: s_sub_i32 s10, 64, 1
+; GFX6-NEXT: s_cmp_lt_u32 1, 64
+; GFX6-NEXT: s_movk_i32 s8, 0x7f
+; GFX6-NEXT: s_cselect_b32 s15, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 1, 0
+; GFX6-NEXT: s_cselect_b32 s16, 1, 0
+; GFX6-NEXT: v_and_b32_e32 v6, s8, v0
+; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX6-NEXT: v_and_b32_e32 v7, s8, v0
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s14
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[8:9], v0
+; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7
+; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7
+; GFX6-NEXT: v_lshl_b64 v[4:5], s[8:9], v7
+; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_lshl_b64 v[0:1], s[8:9], v8
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX6-NEXT: v_mov_b32_e32 v3, s1
+; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6
+; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2
+; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6
+; GFX6-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11
+; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: v_mov_b32_e32 v2, s4
+; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX6-NEXT: v_mov_b32_e32 v3, s5
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX6-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_ssv:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_sub_i32 s14, 1, 64
+; GFX8-NEXT: s_sub_i32 s10, 64, 1
+; GFX8-NEXT: s_cmp_lt_u32 1, 64
+; GFX8-NEXT: s_movk_i32 s8, 0x7f
+; GFX8-NEXT: s_cselect_b32 s15, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 1, 0
+; GFX8-NEXT: s_cselect_b32 s16, 1, 0
+; GFX8-NEXT: v_and_b32_e32 v6, s8, v0
+; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX8-NEXT: v_and_b32_e32 v7, s8, v0
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s14
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s16, 0
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[8:9]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[8:9]
+; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[8:9]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
+; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6
+; GFX8-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
+; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_ssv:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_sub_i32 s14, 1, 64
+; GFX9-NEXT: s_sub_i32 s10, 64, 1
+; GFX9-NEXT: s_cmp_lt_u32 1, 64
+; GFX9-NEXT: s_movk_i32 s8, 0x7f
+; GFX9-NEXT: s_cselect_b32 s15, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 1, 0
+; GFX9-NEXT: v_and_b32_e32 v6, s8, v0
+; GFX9-NEXT: s_cselect_b32 s16, 1, 0
+; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s10
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX9-NEXT: v_and_b32_e32 v7, s8, v0
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s14
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s16, 0
+; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[8:9]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1]
+; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[8:9]
+; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[8:9]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc
+; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7]
+; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6
+; GFX9-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7]
+; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX9-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX9-NEXT: v_or_b32_e32 v1, v9, v1
+; GFX9-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX9-NEXT: v_or_b32_e32 v3, v10, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_ssv:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
+; GFX10-NEXT: s_sub_i32 s14, 1, 64
+; GFX10-NEXT: s_sub_i32 s9, 64, 1
+; GFX10-NEXT: s_cmp_lt_u32 1, 64
+; GFX10-NEXT: s_movk_i32 s8, 0x7f
+; GFX10-NEXT: s_cselect_b32 s15, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 1, 0
+; GFX10-NEXT: v_and_b32_e32 v13, s8, v1
+; GFX10-NEXT: v_and_b32_e32 v12, s8, v0
+; GFX10-NEXT: s_cselect_b32 s16, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s9
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], 1
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s14
+; GFX10-NEXT: s_cmp_lg_u32 s15, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v13
+; GFX10-NEXT: s_cselect_b64 s[10:11], s[12:13], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s16, 0
+; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v12
+; GFX10-NEXT: s_cselect_b64 s[8:9], s[2:3], s[0:1]
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11]
+; GFX10-NEXT: v_lshlrev_b64 v[2:3], v13, s[8:9]
+; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v13
+; GFX10-NEXT: v_lshrrev_b64 v[6:7], v12, s[4:5]
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7]
+; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v13
+; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v12
+; GFX10-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v12
+; GFX10-NEXT: v_lshlrev_b64 v[15:16], v10, s[10:11]
+; GFX10-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX10-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX10-NEXT: v_or_b32_e32 v7, v7, v9
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7]
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], v13, s[10:11]
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v15, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v16, v3, vcc_lo
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], v12, s[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v13
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0
+; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v15, 0, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v8, s8, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s1
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v10, s9, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT: v_or_b32_e32 v0, v11, v0
+; GFX10-NEXT: v_or_b32_e32 v1, v15, v1
+; GFX10-NEXT: v_or_b32_e32 v2, v19, v2
+; GFX10-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+ %cast.result = bitcast i128 %result to <4 x float>
+ ret <4 x float> %cast.result
+}
+
+define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshr_i128_svs:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_movk_i32 s6, 0x7f
+; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT: s_sub_i32 s5, 1, 64
+; GFX6-NEXT: s_sub_i32 s9, 64, 1
+; GFX6-NEXT: s_cmp_lt_u32 1, 64
+; GFX6-NEXT: s_cselect_b32 s14, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 1, 0
+; GFX6-NEXT: s_cselect_b32 s15, 1, 0
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s9
+; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
+; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX6-NEXT: s_cmp_lg_u32 s15, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX6-NEXT: s_sub_i32 s9, s4, 64
+; GFX6-NEXT: s_sub_i32 s5, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b32 s12, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s13, 1, 0
+; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s4
+; GFX6-NEXT: s_lshr_b64 s[10:11], s[6:7], s5
+; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s9
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX6-NEXT: s_cmp_lg_u32 s13, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX6-NEXT: s_sub_i32 s5, 64, s8
+; GFX6-NEXT: s_sub_i32 s4, s8, 64
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5
+; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s8
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s4
+; GFX6-NEXT: s_and_b32 s4, 1, s6
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX6-NEXT: s_and_b32 s4, 1, s7
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: s_and_b32 s4, 1, s6
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX6-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX6-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX6-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s1, v3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_svs:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_movk_i32 s6, 0x7f
+; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT: s_sub_i32 s5, 1, 64
+; GFX8-NEXT: s_sub_i32 s9, 64, 1
+; GFX8-NEXT: s_cmp_lt_u32 1, 64
+; GFX8-NEXT: s_cselect_b32 s14, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 1, 0
+; GFX8-NEXT: s_cselect_b32 s15, 1, 0
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s9
+; GFX8-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
+; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX8-NEXT: s_cmp_lg_u32 s14, 0
+; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX8-NEXT: s_cmp_lg_u32 s15, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_sub_i32 s9, s4, 64
+; GFX8-NEXT: s_sub_i32 s5, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b32 s12, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cselect_b32 s13, 1, 0
+; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s4
+; GFX8-NEXT: s_lshr_b64 s[10:11], s[6:7], s5
+; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s9
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX8-NEXT: s_cmp_lg_u32 s13, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT: s_sub_i32 s5, 64, s8
+; GFX8-NEXT: s_sub_i32 s4, s8, 64
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX8-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX8-NEXT: s_and_b32 s4, 1, s6
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX8-NEXT: s_and_b32 s4, 1, s7
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: s_and_b32 s4, 1, s6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX8-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX8-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s1, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_svs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_movk_i32 s6, 0x7f
+; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_sub_i32 s5, 1, 64
+; GFX9-NEXT: s_sub_i32 s9, 64, 1
+; GFX9-NEXT: s_cmp_lt_u32 1, 64
+; GFX9-NEXT: s_cselect_b32 s14, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 1, 0
+; GFX9-NEXT: s_cselect_b32 s15, 1, 0
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s9
+; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], 1
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1
+; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX9-NEXT: s_cmp_lg_u32 s14, 0
+; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1]
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX9-NEXT: s_sub_i32 s9, s4, 64
+; GFX9-NEXT: s_sub_i32 s5, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b32 s12, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cselect_b32 s13, 1, 0
+; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s4
+; GFX9-NEXT: s_lshr_b64 s[10:11], s[6:7], s5
+; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4
+; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5]
+; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s9
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT: s_sub_i32 s5, 64, s8
+; GFX9-NEXT: s_sub_i32 s4, s8, 64
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
+; GFX9-NEXT: s_cselect_b32 s6, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3]
+; GFX9-NEXT: v_lshrrev_b64 v[8:9], s8, v[2:3]
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3]
+; GFX9-NEXT: s_and_b32 s4, 1, s6
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX9-NEXT: s_and_b32 s4, 1, s7
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: s_and_b32 s4, 1, s6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc
+; GFX9-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX9-NEXT: v_or_b32_e32 v1, s3, v1
+; GFX9-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s1, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_svs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_movk_i32 s6, 0x7f
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX10-NEXT: s_sub_i32 s5, 1, 64
+; GFX10-NEXT: s_sub_i32 s6, 64, 1
+; GFX10-NEXT: s_cmp_lt_u32 1, 64
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1]
+; GFX10-NEXT: s_cselect_b32 s9, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 1, 0
+; GFX10-NEXT: s_cselect_b32 s14, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6
+; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], 1
+; GFX10-NEXT: s_lshl_b64 s[12:13], s[0:1], 1
+; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5
+; GFX10-NEXT: s_cmp_lg_u32 s9, 0
+; GFX10-NEXT: s_cselect_b64 s[10:11], s[12:13], 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_cmp_lg_u32 s14, 0
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_sub_i32 s9, s4, 64
+; GFX10-NEXT: s_sub_i32 s2, 64, s4
+; GFX10-NEXT: s_cmp_lt_u32 s4, 64
+; GFX10-NEXT: s_cselect_b32 s12, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_cselect_b32 s13, 1, 0
+; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s4
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[10:11], s2
+; GFX10-NEXT: s_lshl_b64 s[4:5], s[10:11], s4
+; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_lshl_b64 s[6:7], s[10:11], s9
+; GFX10-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
+; GFX10-NEXT: s_cmp_lg_u32 s13, 0
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT: s_sub_i32 s0, 64, s8
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3]
+; GFX10-NEXT: s_sub_i32 s0, s8, 64
+; GFX10-NEXT: s_cmp_lt_u32 s8, 64
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], s0, v[2:3]
+; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: v_or_b32_e32 v4, v4, v6
+; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT: s_cselect_b32 s0, 1, 0
+; GFX10-NEXT: s_and_b32 s1, 1, vcc_lo
+; GFX10-NEXT: s_and_b32 s0, 1, s0
+; GFX10-NEXT: v_lshrrev_b64 v[2:3], s8, v[2:3]
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v11, v12, v5, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0
+; GFX10-NEXT: v_or_b32_e32 v0, s4, v0
+; GFX10-NEXT: v_or_b32_e32 v1, s5, v1
+; GFX10-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX10-NEXT: v_or_b32_e32 v3, s3, v3
+; GFX10-NEXT: ; return to shader part epilog
+ %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt)
+ %cast.result = bitcast i128 %result to <4 x float>
+ ret <4 x float> %cast.result
+}
+
+define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) {
+; GFX6-LABEL: v_fshr_i128_vss:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_movk_i32 s6, 0x7f
+; GFX6-NEXT: s_mov_b32 s7, 0
+; GFX6-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX6-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX6-NEXT: s_sub_i32 s6, 64, 1
+; GFX6-NEXT: s_sub_i32 s5, 1, 64
+; GFX6-NEXT: s_cmp_lt_u32 1, 64
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 1, 0
+; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], 1
+; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5
+; GFX6-NEXT: s_and_b32 s5, 1, s7
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX6-NEXT: v_or_b32_e32 v7, v5, v7
+; GFX6-NEXT: s_and_b32 s5, 1, s9
+; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX6-NEXT: s_sub_i32 s5, s4, 64
+; GFX6-NEXT: s_sub_i32 s6, 64, s4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 64
+; GFX6-NEXT: s_cselect_b32 s7, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT: s_cselect_b32 s9, 1, 0
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[4:5], s6
+; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], s4
+; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4
+; GFX6-NEXT: s_and_b32 s4, 1, s7
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: s_and_b32 s4, 1, s9
+; GFX6-NEXT: s_sub_i32 s10, s8, 64
+; GFX6-NEXT: s_sub_i32 s9, 64, s8
+; GFX6-NEXT: s_cmp_lt_u32 s8, 64
+; GFX6-NEXT: v_or_b32_e32 v6, v2, v6
+; GFX6-NEXT: v_or_b32_e32 v7, v3, v7
+; GFX6-NEXT: v_lshl_b64 v[2:3], v[4:5], s5
+; GFX6-NEXT: s_cselect_b32 s11, 1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s8, 0
+; GFX6-NEXT: s_cselect_b32 s12, 1, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s8
+; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
+; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s9
+; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX6-NEXT: s_cmp_lg_u32 s12, 0
+; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: s_cmp_lg_u32 s11, 0
+; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX6-NEXT: v_or_b32_e32 v0, s0, v4
+; GFX6-NEXT: v_or_b32_e32 v1, s1, v5
+; GFX6-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX6-NEXT: v_or_b32_e32 v3, s3, v3
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: v_fshr_i128_vss:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_movk_i32 s6, 0x7f
+; GFX8-NEXT: s_mov_b32 s7, 0
+; GFX8-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX8-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX8-NEXT: s_sub_i32 s6, 64, 1
+; GFX8-NEXT: s_sub_i32 s5, 1, 64
+; GFX8-NEXT: s_cmp_lt_u32 1, 64
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX8-NEXT: s_cselect_b32 s9, 1, 0
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX8-NEXT: s_and_b32 s5, 1, s7
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v5, v7
+; GFX8-NEXT: s_and_b32 s5, 1, s9
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX8-NEXT: s_sub_i32 s5, s4, 64
+; GFX8-NEXT: s_sub_i32 s6, 64, s4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 64
+; GFX8-NEXT: s_cselect_b32 s7, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX8-NEXT: s_cselect_b32 s9, 1, 0
+; GFX8-NEXT: v_lshrrev_b64 v[2:3], s6, v[4:5]
+; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[0:1]
+; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
+; GFX8-NEXT: s_and_b32 s4, 1, s7
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: s_and_b32 s4, 1, s9
+; GFX8-NEXT: s_sub_i32 s10, s8, 64
+; GFX8-NEXT: s_sub_i32 s9, 64, s8
+; GFX8-NEXT: s_cmp_lt_u32 s8, 64
+; GFX8-NEXT: v_or_b32_e32 v6, v2, v6
+; GFX8-NEXT: v_or_b32_e32 v7, v3, v7
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], s5, v[4:5]
+; GFX8-NEXT: s_cselect_b32 s11, 1, 0
+; GFX8-NEXT: s_cmp_eq_u32 s8, 0
+; GFX8-NEXT: s_cselect_b32 s12, 1, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s8
+; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
+; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s9
+; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_cmp_lg_u32 s12, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: s_cmp_lg_u32 s11, 0
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v4
+; GFX8-NEXT: v_or_b32_e32 v1, s1, v5
+; GFX8-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NEXT: v_or_b32_e32 v3, s3, v3
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: v_fshr_i128_vss:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_movk_i32 s6, 0x7f
+; GFX9-NEXT: s_mov_b32 s7, 0
+; GFX9-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX9-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-NEXT: s_sub_i32 s6, 64, 1
+; GFX9-NEXT: s_sub_i32 s5, 1, 64
+; GFX9-NEXT: s_cmp_lt_u32 1, 64
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX9-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1]
+; GFX9-NEXT: s_and_b32 s5, 1, s7
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX9-NEXT: v_or_b32_e32 v7, v5, v7
+; GFX9-NEXT: s_and_b32 s5, 1, s9
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5
+; GFX9-NEXT: s_sub_i32 s5, s4, 64
+; GFX9-NEXT: s_sub_i32 s6, 64, s4
+; GFX9-NEXT: s_cmp_lt_u32 s4, 64
+; GFX9-NEXT: s_cselect_b32 s7, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: s_cselect_b32 s9, 1, 0
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], s6, v[4:5]
+; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5]
+; GFX9-NEXT: s_and_b32 s4, 1, s7
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: s_and_b32 s4, 1, s9
+; GFX9-NEXT: s_sub_i32 s10, s8, 64
+; GFX9-NEXT: s_sub_i32 s9, 64, s8
+; GFX9-NEXT: s_cmp_lt_u32 s8, 64
+; GFX9-NEXT: v_or_b32_e32 v6, v2, v6
+; GFX9-NEXT: v_or_b32_e32 v7, v3, v7
+; GFX9-NEXT: v_lshlrev_b64 v[2:3], s5, v[4:5]
+; GFX9-NEXT: s_cselect_b32 s11, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s8, 0
+; GFX9-NEXT: s_cselect_b32 s12, 1, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
+; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s8
+; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s8
+; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s9
+; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u32 s12, 0
+; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX9-NEXT: s_cmp_lg_u32 s11, 0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v4
+; GFX9-NEXT: v_or_b32_e32 v1, s1, v5
+; GFX9-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX9-NEXT: v_or_b32_e32 v3, s3, v3
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: v_fshr_i128_vss:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_movk_i32 s6, 0x7f
+; GFX10-NEXT: s_mov_b32 s7, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[2:3]
+; GFX10-NEXT: s_and_b64 s[8:9], s[4:5], s[6:7]
+; GFX10-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5]
+; GFX10-NEXT: s_sub_i32 s4, 64, 1
+; GFX10-NEXT: s_sub_i32 s5, 1, 64
+; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1]
+; GFX10-NEXT: s_cmp_lt_u32 1, 64
+; GFX10-NEXT: v_lshlrev_b64 v[13:14], s5, v[0:1]
+; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 1, 0
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GFX10-NEXT: s_cselect_b32 s7, 1, 0
+; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, v4, v6
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT: v_or_b32_e32 v5, v5, v7
+; GFX10-NEXT: s_and_b32 s5, 1, s7
+; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v13, v6, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v5, s4
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc_lo
+; GFX10-NEXT: s_sub_i32 s5, s6, 64
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
+; GFX10-NEXT: s_sub_i32 s4, 64, s6
+; GFX10-NEXT: s_cmp_lt_u32 s6, 64
+; GFX10-NEXT: v_lshrrev_b64 v[11:12], s4, v[4:5]
+; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0
+; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[0:1]
+; GFX10-NEXT: s_cmp_eq_u32 s6, 0
+; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[4:5]
+; GFX10-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b64 v[4:5], s5, v[4:5]
+; GFX10-NEXT: v_or_b32_e32 v2, v11, v6
+; GFX10-NEXT: v_or_b32_e32 v3, v12, v7
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT: s_sub_i32 s10, s8, 64
+; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v2, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, v3, s4
+; GFX10-NEXT: s_and_b32 s4, 1, s6
+; GFX10-NEXT: s_sub_i32 s6, 64, s8
+; GFX10-NEXT: s_cmp_lt_u32 s8, 64
+; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX10-NEXT: s_cselect_b32 s11, 1, 0
+; GFX10-NEXT: s_cmp_eq_u32 s8, 0
+; GFX10-NEXT: s_cselect_b32 s12, 1, 0
+; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8
+; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s6
+; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8
+; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
+; GFX10-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_cmp_lg_u32 s12, 0
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
+; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:
v_or_b32_e32 v0, s0, v6 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 +; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 +; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) + %cast.result = bitcast i128 %result to <4 x float> + ret <4 x float> %cast.result +} + +define amdgpu_ps i128 @s_fshr_i128_65(i128 inreg %lhs, i128 inreg %rhs) { +; GFX6-LABEL: s_fshr_i128_65: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_sub_i32 s14, 63, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, 63 +; GFX6-NEXT: s_cmp_lt_u32 63, 64 +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 63, 0 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX6-NEXT: s_lshl_b32 s13, s2, 31 +; GFX6-NEXT: s_mov_b32 s12, s8 +; GFX6-NEXT: s_lshl_b32 s9, s0, 31 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_movk_i32 s10, 0x41 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s14, s10, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cselect_b32 s16, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_i128_65: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_sub_i32 s14, 63, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, 63 +; GFX8-NEXT: s_cmp_lt_u32 63, 64 +; GFX8-NEXT: s_mov_b32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 63, 0 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX8-NEXT: s_lshl_b32 s13, s2, 31 +; GFX8-NEXT: s_mov_b32 s12, s8 +; GFX8-NEXT: s_lshl_b32 s9, s0, 31 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_movk_i32 s10, 0x41 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s14, s10, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cselect_b32 s16, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX8-NEXT: 
s_cmp_lg_u32 s15, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_i128_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_sub_i32 s14, 63, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, 63 +; GFX9-NEXT: s_cmp_lt_u32 63, 64 +; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 63, 0 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX9-NEXT: s_lshl_b32 s13, s2, 31 +; GFX9-NEXT: s_mov_b32 s12, s8 +; GFX9-NEXT: s_lshl_b32 s9, s0, 31 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_movk_i32 s10, 0x41 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s14, s10, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s16, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_i128_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_i32 s14, 63, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, 63 +; GFX10-NEXT: s_cmp_lt_u32 63, 64 +; GFX10-NEXT: s_mov_b32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 63, 0 +; GFX10-NEXT: s_mov_b32 s12, s8 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s9 +; GFX10-NEXT: s_lshl_b32 s13, s2, 31 +; GFX10-NEXT: s_lshl_b32 s9, s0, 31 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_movk_i32 s12, 0x41 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s14, s12, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 +; GFX10-NEXT: s_cselect_b32 s15, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s12 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[6:7], s12 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] +; GFX10-NEXT: s_cmp_lg_u32 s16, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: ; return to shader part epilog + %result = call i128 
@llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) + ret i128 %result +} + +define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { +; GFX6-LABEL: v_fshr_i128_65: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_sub_i32 s5, 64, 63 +; GFX6-NEXT: s_sub_i32 s4, 63, 64 +; GFX6-NEXT: s_cmp_lt_u32 63, 64 +; GFX6-NEXT: s_cselect_b32 s6, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 63, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], s5 +; GFX6-NEXT: v_lshlrev_b32_e32 v10, 31, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v11, 31, v2 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_and_b32 s4, 1, s6 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_movk_i32 s4, 0x41 +; GFX6-NEXT: s_sub_i32 s5, s4, 64 +; GFX6-NEXT: s_sub_i32 s6, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s4 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], s6 +; GFX6-NEXT: s_cselect_b32 s7, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[6:7], s4 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], s5 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s8 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s7 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_i128_65: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s5, 64, 63 +; GFX8-NEXT: s_sub_i32 s4, 63, 64 +; GFX8-NEXT: s_cmp_lt_u32 63, 64 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 63, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1] +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 31, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 31, v2 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_and_b32 s4, 1, s6 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: s_and_b32 s4, 1, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_movk_i32 s4, 0x41 +; GFX8-NEXT: s_sub_i32 s5, s4, 64 +; GFX8-NEXT: s_sub_i32 s6, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[2:3], s6, v[6:7] +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[6:7] +; GFX8-NEXT: s_and_b32 
s4, 1, s7 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s5, v[6:7] +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s8 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s7 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i128_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s5, 64, 63 +; GFX9-NEXT: s_sub_i32 s4, 63, 64 +; GFX9-NEXT: s_cmp_lt_u32 63, 64 +; GFX9-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 63, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s5, v[0:1] +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 31, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 31, v2 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_and_b32 s4, 1, s6 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_movk_i32 s4, 0x41 +; GFX9-NEXT: s_sub_i32 s5, s4, 64 +; GFX9-NEXT: s_sub_i32 s6, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], s6, v[6:7] +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[6:7] +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s5, v[6:7] +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s8 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_i128_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_sub_i32 s4, 64, 63 +; GFX10-NEXT: s_sub_i32 s5, 63, 64 +; GFX10-NEXT: s_cmp_lt_u32 63, 64 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 31, v0 +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 63, 0 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s4, v[0:1] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[14:15], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 31, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v11, vcc_lo +; GFX10-NEXT: 
v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_movk_i32 s6, 0x41 +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: s_sub_i32 s5, 64, s6 +; GFX10-NEXT: v_or_b32_e32 v12, v9, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v14, v8, vcc_lo +; GFX10-NEXT: v_lshlrev_b64 v[10:11], s5, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5] +; GFX10-NEXT: s_sub_i32 s5, s6, 64 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[15:16], s5, v[6:7] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX10-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_lshrrev_b64 v[6:7], s6, v[6:7] +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: s_and_b32 s6, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v15, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v9, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, s5 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v19, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v12, v3, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v5, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v4, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v7, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 65) + ret i128 %result +} + +define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { +; GFX6-LABEL: s_fshr_v2i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_movk_i32 s18, 0x7f +; GFX6-NEXT: s_mov_b32 s19, 0 +; GFX6-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX6-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX6-NEXT: s_sub_i32 s30, 1, 64 +; GFX6-NEXT: s_sub_i32 s31, 64, 1 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s23, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s31 +; GFX6-NEXT: s_lshl_b64 s[28:29], s[2:3], 1 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s30 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s23, 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: s_sub_i32 s23, s16, 64 +; GFX6-NEXT: s_sub_i32 s17, 64, s16 +; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 +; GFX6-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] +; GFX6-NEXT: s_sub_i32 s26, s22, 64 +; GFX6-NEXT: s_sub_i32 s24, 64, s22 +; GFX6-NEXT: s_cmp_lt_u32 s22, 64 +; GFX6-NEXT: s_cselect_b32 s27, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s22, 0 +; GFX6-NEXT: s_cselect_b32 s28, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[0:1], 
s[10:11], s22 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 +; GFX6-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX6-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s28, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s27, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] +; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX6-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s31 +; GFX6-NEXT: s_lshl_b64 s[20:21], s[6:7], 1 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s30 +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[18:19], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s11, 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX6-NEXT: s_sub_i32 s9, s10, 64 +; GFX6-NEXT: s_sub_i32 s11, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] +; GFX6-NEXT: s_sub_i32 s18, s8, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s8 +; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s20, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] +; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshr_v2i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_movk_i32 s18, 0x7f +; GFX8-NEXT: s_mov_b32 s19, 0 +; GFX8-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX8-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX8-NEXT: s_sub_i32 s30, 1, 64 +; GFX8-NEXT: s_sub_i32 s31, 64, 1 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s23, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s31 +; GFX8-NEXT: s_lshl_b64 s[28:29], s[2:3], 1 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s30 +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_cselect_b64 s[24:25], 
s[24:25], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s23, 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_sub_i32 s23, s16, 64 +; GFX8-NEXT: s_sub_i32 s17, 64, s16 +; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 +; GFX8-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] +; GFX8-NEXT: s_sub_i32 s26, s22, 64 +; GFX8-NEXT: s_sub_i32 s24, 64, s22 +; GFX8-NEXT: s_cmp_lt_u32 s22, 64 +; GFX8-NEXT: s_cselect_b32 s27, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s22, 0 +; GFX8-NEXT: s_cselect_b32 s28, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 +; GFX8-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX8-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s28, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s27, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] +; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX8-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s31 +; GFX8-NEXT: s_lshl_b64 s[20:21], s[6:7], 1 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s30 +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[18:19], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX8-NEXT: s_sub_i32 s9, s10, 64 +; GFX8-NEXT: s_sub_i32 s11, 64, s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] +; GFX8-NEXT: s_sub_i32 s18, s8, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s8 +; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s20, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: 
s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshr_v2i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_movk_i32 s18, 0x7f +; GFX9-NEXT: s_mov_b32 s19, 0 +; GFX9-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX9-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX9-NEXT: s_sub_i32 s30, 1, 64 +; GFX9-NEXT: s_sub_i32 s31, 64, 1 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s23, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s31 +; GFX9-NEXT: s_lshl_b64 s[28:29], s[2:3], 1 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[26:27], s[26:27], s[28:29] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s30 +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[26:27], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s23, 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: s_sub_i32 s23, s16, 64 +; GFX9-NEXT: s_sub_i32 s17, 64, s16 +; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[24:25], s16 +; GFX9-NEXT: s_lshr_b64 s[26:27], s[24:25], s17 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 +; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[24:25], s[24:25], s23 +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] +; GFX9-NEXT: s_sub_i32 s26, s22, 64 +; GFX9-NEXT: s_sub_i32 s24, 64, s22 +; GFX9-NEXT: s_cmp_lt_u32 s22, 64 +; GFX9-NEXT: s_cselect_b32 s27, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s22, 0 +; GFX9-NEXT: s_cselect_b32 s28, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s22 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s24 +; GFX9-NEXT: s_lshr_b64 s[22:23], s[8:9], s22 +; GFX9-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s26 +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[22:23], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s28, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s27, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] +; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX9-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s31 +; GFX9-NEXT: s_lshl_b64 s[20:21], s[6:7], 1 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s30 +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[18:19], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX9-NEXT: s_sub_i32 s9, s10, 64 +; GFX9-NEXT: s_sub_i32 s11, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 
64 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 +; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] +; GFX9-NEXT: s_sub_i32 s18, s8, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s8 +; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s20, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshr_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_movk_i32 s18, 0x7f +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_sub_i32 s30, 1, 64 +; GFX10-NEXT: s_and_b64 s[22:23], s[16:17], s[18:19] +; GFX10-NEXT: s_andn2_b64 s[16:17], s[18:19], s[16:17] +; GFX10-NEXT: s_sub_i32 s31, 64, 1 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: s_mov_b32 s62, s10 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: s_mov_b32 s63, s11 +; GFX10-NEXT: s_cselect_b32 s23, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s31 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], 1 +; GFX10-NEXT: s_lshl_b64 s[28:29], s[0:1], 1 +; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s30 +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[26:27], s[28:29], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s23, 0 +; GFX10-NEXT: s_cselect_b64 s[46:47], s[2:3], s[0:1] +; GFX10-NEXT: s_sub_i32 s23, s16, 64 +; GFX10-NEXT: s_sub_i32 s2, 64, s16 +; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cselect_b32 s29, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[46:47], s16 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[26:27], s2 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[26:27], s16 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[24:25], s[26:27], s23 +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[78:79], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[24:25] +; GFX10-NEXT: s_cmp_lg_u32 s29, 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[46:47], s[2:3] +; GFX10-NEXT: s_sub_i32 s26, s22, 64 +; GFX10-NEXT: s_sub_i32 s23, 64, s22 +; GFX10-NEXT: s_cmp_lt_u32 s22, 64 +; GFX10-NEXT: s_cselect_b32 s27, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s22, 0 +; GFX10-NEXT: s_cselect_b32 s28, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[62:63], s23 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[62:63], s22 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] +; GFX10-NEXT: 
s_lshr_b64 s[10:11], s[62:63], s26 +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s27, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[78:79], s[0:1] +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] +; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], s[18:19] +; GFX10-NEXT: s_andn2_b64 s[10:11], s[18:19], s[20:21] +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: s_cselect_b32 s11, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s31 +; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], 1 +; GFX10-NEXT: s_lshl_b64 s[20:21], s[4:5], 1 +; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s30 +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 +; GFX10-NEXT: s_cselect_b64 s[18:19], s[20:21], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] +; GFX10-NEXT: s_sub_i32 s9, s10, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s10 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[18:19], s6 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[18:19], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[16:17] +; GFX10-NEXT: s_lshl_b64 s[16:17], s[18:19], s9 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[16:17] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[4:5], s[6:7] +; GFX10-NEXT: s_sub_i32 s18, s8, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s8 +; GFX10-NEXT: s_cmp_lt_u32 s8, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: ; return to shader part epilog + %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) + ret <2 x i128> %result +} + +define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) { +; GFX6-LABEL: v_fshr_v2i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_sub_i32 s6, 64, 1 +; GFX6-NEXT: s_sub_i32 s7, 1, 64 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], s6 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], 1 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 +; GFX6-NEXT: s_and_b32 s4, 1, s4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 
1, s5 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX6-NEXT: s_movk_i32 s8, 0x7f +; GFX6-NEXT: v_xor_b32_e32 v2, -1, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v21, vcc +; GFX6-NEXT: v_and_b32_e32 v19, s8, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[17:18], v2 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v19 +; GFX6-NEXT: v_and_b32_e32 v25, s8, v16 +; GFX6-NEXT: v_or_b32_e32 v23, v2, v21 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v25 +; GFX6-NEXT: v_or_b32_e32 v24, v3, v22 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 +; GFX6-NEXT: v_lshr_b64 v[21:22], v[8:9], v25 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_or_b32_e32 v21, v21, v2 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v19 +; GFX6-NEXT: v_or_b32_e32 v22, v22, v3 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[17:18], v2 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX6-NEXT: v_subrev_i32_e64 v0, s[4:5], 64, v25 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[17:18], v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v0 +; GFX6-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v25 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX6-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX6-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX6-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX6-NEXT: v_and_b32_e32 v17, s8, v8 +; GFX6-NEXT: s_cmp_lt_u32 1, 64 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX6-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], s6 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], 1 +; GFX6-NEXT: s_cselect_b32 s4, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 1, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_and_b32 s4, 1, s4 +; GFX6-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], s7 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: s_and_b32 s4, 1, s5 +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v17 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[8:9], v6 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[4:5], v17 +; GFX6-NEXT: v_subrev_i32_e32 v18, vcc, 64, v17 +; GFX6-NEXT: v_or_b32_e32 v10, v6, v10 +; GFX6-NEXT: v_or_b32_e32 v11, v7, v11 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[8:9], v17 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[8:9], v18 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX6-NEXT: v_and_b32_e32 v16, s8, v20 +; GFX6-NEXT: v_cndmask_b32_e32 
v18, 0, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v6, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v7, v5, vcc +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v16 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 +; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 64, v16 +; GFX6-NEXT: v_or_b32_e32 v11, v4, v6 +; GFX6-NEXT: v_or_b32_e32 v17, v5, v7 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v16 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX6-NEXT: v_or_b32_e32 v4, v18, v6 +; GFX6-NEXT: v_or_b32_e32 v5, v19, v7 +; GFX6-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX6-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fshr_v2i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_sub_i32 s6, 64, 1 +; GFX8-NEXT: s_sub_i32 s7, 1, 64 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], s6, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3] +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX8-NEXT: s_movk_i32 s8, 0x7f +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v21, vcc +; GFX8-NEXT: v_and_b32_e32 v19, s8, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v25, s8, v16 +; GFX8-NEXT: v_or_b32_e32 v23, v2, v21 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v25 +; GFX8-NEXT: v_or_b32_e32 v24, v3, v22 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX8-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_or_b32_e32 v21, v21, v2 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v19 +; GFX8-NEXT: v_or_b32_e32 v22, v22, v3 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[17:18] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX8-NEXT: v_subrev_u32_e64 v0, s[4:5], 64, v25 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX8-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v17, 
0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX8-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX8-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX8-NEXT: v_and_b32_e32 v17, s8, v8 +; GFX8-NEXT: s_cmp_lt_u32 1, 64 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] +; GFX8-NEXT: s_cselect_b32 s4, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 1, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_and_b32 s4, 1, s4 +; GFX8-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], s7, v[4:5] +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: s_and_b32 s4, 1, s5 +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v17 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v6, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v17, v[4:5] +; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, 64, v17 +; GFX8-NEXT: v_or_b32_e32 v10, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v11, v7, v11 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], v17, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX8-NEXT: v_and_b32_e32 v16, s8, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v7, v5, vcc +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v16, v[12:13] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] +; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 64, v16 +; GFX8-NEXT: v_or_b32_e32 v11, v4, v6 +; GFX8-NEXT: v_or_b32_e32 v17, v5, v7 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v18, v6 +; GFX8-NEXT: v_or_b32_e32 v5, v19, v7 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_sub_i32 s6, 64, 1 +; GFX9-NEXT: s_sub_i32 s7, 1, 64 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], s6, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3] +; GFX9-NEXT: 
s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: v_or_b32_e32 v19, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v21, v18, v22 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v19, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX9-NEXT: s_movk_i32 s8, 0x7f +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v21, vcc +; GFX9-NEXT: v_and_b32_e32 v19, s8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v18, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v25, s8, v16 +; GFX9-NEXT: v_or_b32_e32 v23, v2, v21 +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v25 +; GFX9-NEXT: v_or_b32_e32 v24, v3, v22 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], v25, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_or_b32_e32 v21, v21, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v19 +; GFX9-NEXT: v_or_b32_e32 v22, v22, v3 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[17:18] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v25 +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[17:18] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v18, v0, v21, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v1, v22, s[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v25, v[10:11] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v18, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[4:5] +; GFX9-NEXT: v_or_b32_e32 v0, v16, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v9, v19, v9, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v20 +; GFX9-NEXT: v_or_b32_e32 v1, v17, v9 +; GFX9-NEXT: v_and_b32_e32 v17, s8, v8 +; GFX9-NEXT: s_cmp_lt_u32 1, 64 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] +; GFX9-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 1, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_and_b32 s4, 1, s4 +; GFX9-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], s7, v[4:5] +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: s_and_b32 s4, 1, s5 +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v17 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v6, v[8:9] 
+; GFX9-NEXT: v_lshlrev_b64 v[10:11], v17, v[4:5] +; GFX9-NEXT: v_subrev_u32_e32 v18, 64, v17 +; GFX9-NEXT: v_or_b32_e32 v10, v6, v10 +; GFX9-NEXT: v_or_b32_e32 v11, v7, v11 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v17, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], v18, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v17 +; GFX9-NEXT: v_and_b32_e32 v16, s8, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v7, v5, vcc +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v16, v[12:13] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] +; GFX9-NEXT: v_subrev_u32_e32 v10, 64, v16 +; GFX9-NEXT: v_or_b32_e32 v11, v4, v6 +; GFX9-NEXT: v_or_b32_e32 v17, v5, v7 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v10, v[14:15] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v16, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v13, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc +; GFX9-NEXT: v_or_b32_e32 v4, v18, v6 +; GFX9-NEXT: v_or_b32_e32 v5, v19, v7 +; GFX9-NEXT: v_or_b32_e32 v6, v8, v10 +; GFX9-NEXT: v_or_b32_e32 v7, v9, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fshr_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_sub_i32 s5, 64, 1 +; GFX10-NEXT: s_sub_i32 s6, 1, 64 +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[21:22], 1, v[2:3] +; GFX10-NEXT: s_cselect_b32 vcc_lo, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[23:24], 1, v[0:1] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: s_and_b32 s4, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX10-NEXT: v_xor_b32_e32 v19, -1, v16 +; GFX10-NEXT: v_or_b32_e32 v21, v27, v21 +; GFX10-NEXT: v_or_b32_e32 v18, v28, v22 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s4 +; GFX10-NEXT: s_movk_i32 s7, 0x7f +; GFX10-NEXT: s_and_b32 s8, 1, s8 +; GFX10-NEXT: v_and_b32_e32 v31, s7, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v17, 0, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v21, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e32 v18, 0, v24, vcc_lo +; GFX10-NEXT: v_sub_nc_u32_e32 v19, 64, v31 +; GFX10-NEXT: v_and_b32_e32 v26, s7, v16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v31 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v19, v[17:18] +; GFX10-NEXT: v_mov_b32_e32 v35, v10 +; GFX10-NEXT: v_mov_b32_e32 v36, v11 +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v26 +; GFX10-NEXT: v_lshlrev_b64 v[21:22], v31, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[23:24], v31, v[17:18] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v31 +; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v26 +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 +; GFX10-NEXT: v_lshrrev_b64 v[27:28], s5, v[4:5] +; GFX10-NEXT: v_or_b32_e32 v21, v2, v21 +; GFX10-NEXT: v_or_b32_e32 v22, v3, v22 +; GFX10-NEXT: 
v_lshlrev_b64 v[2:3], v16, v[17:18] +; GFX10-NEXT: v_lshlrev_b64 v[18:19], v25, v[35:36] +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] +; GFX10-NEXT: v_cndmask_b32_e32 v23, 0, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v39, 0, v24, vcc_lo +; GFX10-NEXT: s_cmp_lt_u32 1, 64 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v21, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v22, v3, v22, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v29, v[35:36] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX10-NEXT: v_or_b32_e32 v16, v16, v18 +; GFX10-NEXT: v_or_b32_e32 v17, v17, v19 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], 1, v[6:7] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v18, v21, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v31, v22, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v16, s4 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 +; GFX10-NEXT: s_cmp_eq_u32 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v3, v17, s4 +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], 1, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[4:5], s6, v[4:5] +; GFX10-NEXT: s_and_b32 s6, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v21, v2, v8, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v25, -1, v20 +; GFX10-NEXT: v_or_b32_e32 v2, v27, v10 +; GFX10-NEXT: v_or_b32_e32 v3, v28, v11 +; GFX10-NEXT: v_cmp_ne_u32_e64 s6, 0, s6 +; GFX10-NEXT: s_and_b32 s8, 1, s8 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v26, v[35:36] +; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v16, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v16, v19, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v11, v4, v2, s6 +; GFX10-NEXT: v_and_b32_e32 v30, s7, v25 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v3, s6 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v17, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, v0, s4 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v30 +; GFX10-NEXT: v_or_b32_e32 v0, v23, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v11, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 +; GFX10-NEXT: v_lshrrev_b64 v[5:6], v2, v[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 64, v30 +; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v1, s4 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v30, v[3:4] +; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 +; GFX10-NEXT: v_or_b32_e32 v1, v39, v16 +; GFX10-NEXT: v_or_b32_e32 v2, v18, v19 +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v30, v[8:9] +; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] +; GFX10-NEXT: v_or_b32_e32 v10, v5, v10 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, 64, v23 +; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v30 +; GFX10-NEXT: v_lshlrev_b64 v[7:8], v7, v[8:9] +; GFX10-NEXT: v_or_b32_e32 v9, v6, v11 +; GFX10-NEXT: v_lshrrev_b64 v[34:35], v5, v[14:15] +; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v23 +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v16, v18, v20 +; GFX10-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v7, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[7:8], v23, v[14:15] +; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v34, v16, s4 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v35, v18, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v15, v10, v3, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v4, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, v12, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, 
v13, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v7, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s4 +; GFX10-NEXT: v_or_b32_e32 v3, v31, v26 +; GFX10-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX10-NEXT: v_or_b32_e32 v5, v14, v5 +; GFX10-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX10-NEXT: v_or_b32_e32 v7, v9, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) + ret <2 x i128> %result +} + +declare i7 @llvm.fshr.i7(i7, i7, i7) #0 +declare i8 @llvm.fshr.i8(i8, i8, i8) #0 +declare <2 x i8> @llvm.fshr.v2i8(<2 x i8>, <2 x i8>, <2 x i8>) #0 +declare <4 x i8> @llvm.fshr.v4i8(<4 x i8>, <4 x i8>, <4 x i8>) #0 + +declare i16 @llvm.fshr.i16(i16, i16, i16) #0 +declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) #0 +declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) #0 +declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) #0 +declare <5 x i16> @llvm.fshr.v5i16(<5 x i16>, <5 x i16>, <5 x i16>) #0 +declare <6 x i16> @llvm.fshr.v6i16(<6 x i16>, <6 x i16>, <6 x i16>) #0 +declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) #0 + +declare i24 @llvm.fshr.i24(i24, i24, i24) #0 +declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) #0 + +declare i32 @llvm.fshr.i32(i32, i32, i32) #0 +declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #0 +declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) #0 +declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #0 +declare <5 x i32> @llvm.fshr.v5i32(<5 x i32>, <5 x i32>, <5 x i32>) #0 +declare <16 x i32> @llvm.fshr.v16i32(<16 x i32>, <16 x i32>, <16 x i32>) #0 + +declare i48 @llvm.fshr.i48(i48, i48, i48) #0 + +declare i64 @llvm.fshr.i64(i64, i64, i64) #0 +declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) #0 + +declare i128 @llvm.fshr.i128(i128, i128, i128) #0 +declare <2 x i128> @llvm.fshr.v2i128(<2 x i128>, <2 x i128>, <2 x i128>) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir @@ -0,0 +1,1320 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=SI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: test_fshl_s32_s32 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; SI-LABEL: name: test_fshl_s32_s32 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; SI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; SI: $vgpr0 = COPY [[FSHR1]](s32) + ; VI-LABEL: name: test_fshl_s32_s32 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = 
COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; VI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; VI: $vgpr0 = COPY [[FSHR1]](s32) + ; GFX9-LABEL: name: test_fshl_s32_s32 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[COPY]], [[COPY1]], [[C]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY2]], [[C1]] + ; GFX9: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; GFX9: $vgpr0 = COPY [[FSHR1]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = G_FSHL %0, %1, %2 + $vgpr0 = COPY %3 +... + +--- +name: test_fshl_v2s32_v2s32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; SI-LABEL: name: test_fshl_v2s32_v2s32 + ; SI: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; SI: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; SI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; SI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; SI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; SI: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; SI: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; SI: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) + ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; SI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; VI-LABEL: name: test_fshl_v2s32_v2s32 + ; VI: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; VI: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; VI: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; VI: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; VI: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; VI: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR 
[[UV1]], [[C]](s32) + ; VI: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; VI: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) + ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; VI: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; GFX9-LABEL: name: test_fshl_v2s32_v2s32 + ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX9: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY2]](<2 x s32>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[FSHR:%[0-9]+]]:_(s32) = G_FSHR [[UV]], [[UV2]], [[C]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[UV4]], [[C1]] + ; GFX9: [[FSHR1:%[0-9]+]]:_(s32) = G_FSHR [[LSHR]], [[FSHR]], [[XOR]](s32) + ; GFX9: [[FSHR2:%[0-9]+]]:_(s32) = G_FSHR [[UV1]], [[UV3]], [[C]](s32) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX9: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[UV5]], [[C1]] + ; GFX9: [[FSHR3:%[0-9]+]]:_(s32) = G_FSHR [[LSHR1]], [[FSHR2]], [[XOR1]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FSHR1]](s32), [[FSHR3]](s32) + ; GFX9: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 + %2:_(<2 x s32>) = COPY $vgpr4_vgpr5 + %3:_(<2 x s32>) = G_FSHL %0, %1, %2 + $vgpr0_vgpr1 = COPY %3 +... 
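The SI/VI/GFX9 checks above all expect the same rewrite for a variable amount: fshl X, Y, Z becomes G_FSHR (G_LSHR X, 1), (G_FSHR X, Y, 1), (G_XOR Z, -1). A minimal C++ reference model of that identity, illustrative only and not part of the patch (fshl32/fshr32 are hypothetical helpers following the LangRef funnel-shift semantics):

#include <cassert>
#include <cstdint>

// LangRef semantics: the amount is taken modulo the bit width, and a
// reduced amount of zero returns the first (fshl) or second (fshr) operand.
static uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned C = Z % 32;
  return C ? (X << C) | (Y >> (32 - C)) : X;
}
static uint32_t fshr32(uint32_t X, uint32_t Y, uint32_t Z) {
  unsigned C = Z % 32;
  return C ? (X << (32 - C)) | (Y >> C) : Y;
}

int main() {
  // fshl X, Y, Z == fshr (X >> 1), (fshr X, Y, 1), ~Z for every Z, because
  // ~Z % 32 == 31 - (Z % 32) and the missing bit is pre-shifted into place.
  for (uint32_t Z = 0; Z < 256; ++Z) {
    uint32_t X = 0x12345678u * (Z + 1), Y = 0x9ABCDEF0u ^ Z;
    assert(fshl32(X, Y, Z) == fshr32(X >> 1, fshr32(X, Y, 1), ~Z));
  }
}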
+ +--- +name: test_fshl_s16_s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; SI-LABEL: name: test_fshl_s16_s16 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C2]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC1]], [[TRUNC2]] + ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; SI: $vgpr0 = COPY [[ANYEXT]](s32) + ; VI-LABEL: name: test_fshl_s16_s16 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16) + ; VI: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[LSHR]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR1]] + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; VI: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-LABEL: name: test_fshl_s16_s16 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C2]](s16) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[LSHR]], [[AND1]](s16) + ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR1]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = 
G_ANYEXT [[OR]](s16) + ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s16) = G_TRUNC %0 + %4:_(s16) = G_TRUNC %1 + %5:_(s16) = G_TRUNC %2 + %6:_(s16) = G_FSHL %3, %4, %5 + %7:_(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... + +--- +name: test_fshl_v2s16_v2s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; SI-LABEL: name: test_fshl_v2s16_v2s16 + ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]] + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C4]] + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC4]], [[TRUNC5]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL2]] + ; SI: 
[[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) + ; VI-LABEL: name: test_fshl_v2s16_v2s16 + ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C2]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C3]](s16) + ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[LSHR3]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR4]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C1]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C2]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C3]](s16) + ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[LSHR5]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR6]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: $vgpr0 = COPY [[BITCAST3]](<2 x s16>) + ; GFX9-LABEL: name: test_fshl_v2s16_v2s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9: 
[[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[AND]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]] + ; GFX9: $vgpr0 = COPY [[OR]](<2 x s16>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = COPY $vgpr2 + %3:_(<2 x s16>) = G_FSHL %0, %1, %2 + $vgpr0 = COPY %3 +... + +--- +name: test_fshl_s64_s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; SI-LABEL: name: test_fshl_s64_s64 + ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; SI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 + ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; SI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; SI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; SI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; SI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32) + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; SI: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32) + ; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]] + ; SI: $vgpr0_vgpr1 = COPY [[OR]](s64) + ; VI-LABEL: name: test_fshl_s64_s64 + ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; VI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 + ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; VI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; VI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; VI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; VI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32) + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; VI: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32) + ; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]] + ; VI: $vgpr0_vgpr1 = COPY [[OR]](s64) + ; GFX9-LABEL: name: test_fshl_s64_s64 + ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 + ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; GFX9: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[TRUNC]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C2]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[LSHR]], [[TRUNC1]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL]], [[LSHR1]] + ; GFX9: $vgpr0_vgpr1 = COPY [[OR]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s64) = COPY $vgpr2_vgpr3 + %2:_(s64) = COPY $vgpr4_vgpr5 + %3:_(s64) = G_FSHL %0, %1, %2 + $vgpr0_vgpr1 = COPY %3 +... 
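In the s64 expansion above, the complementary right shift is split into G_LSHR by 1 followed by G_LSHR by ~Z & 63, so no single shift amount can reach the bit width: ~Z & 63 equals 63 - (Z & 63), hence the two steps total 64 - (Z & 63), and when Z is a multiple of 64 both shifts are still defined and flush Y's contribution to zero, leaving X << 0 = X. A one-line sanity check of the arithmetic, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // The split right shift performs 1 + (~Z & 63) in total, which is
  // exactly the complementary amount 64 - (Z & 63), always in [1, 64].
  for (uint64_t Z = 0; Z < 512; ++Z)
    assert(1 + (~Z & 63) == 64 - (Z & 63));
}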
+ +--- +name: test_fshl_s8_s8 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; SI-LABEL: name: test_fshl_s8_s8 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND2]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[COPY16]] + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; SI: $vgpr0 = COPY [[COPY17]](s32) + ; VI-LABEL: name: test_fshl_s8_s8 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32) + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32) + ; VI: 
[[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND4]], [[TRUNC3]](s16) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; VI: [[COPY12:%[0-9]+]]:_(s16) = COPY [[LSHR]](s16) + ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[COPY12]], [[C4]] + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND5]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[AND6]], [[TRUNC4]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL]](s16) + ; VI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]] + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; VI: $vgpr0 = COPY [[COPY13]](s32) + ; GFX9-LABEL: name: test_fshl_s8_s8 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND4]], [[TRUNC3]](s16) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s16) = COPY [[LSHR]](s16) + ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[COPY12]], [[C4]] + ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND5]](s32) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s16) = G_LSHR [[AND6]], [[TRUNC4]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL]](s16) + ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR1]](s16) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]] + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[COPY13]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s8) = G_TRUNC %0 + %4:_(s8) = G_TRUNC %1 + %5:_(s8) = G_TRUNC %2 + %6:_(s8) = G_FSHL %3, %4, %5 + %7:_(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... 
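Because s8 shifts are not legal here, the operation is widened and every right-shift input is re-masked with AND 255, so the anyext garbage above bit 7 cannot leak into the low byte. A worked example of the value such an expansion computes, illustrative only and not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // fshl i8 0xAB, 0xCD, 3 computed in a wider type with explicit masking,
  // mirroring the AND-with-255 operands in the checks above.
  uint32_t X = 0xAB, Y = 0xCD, C = 3;
  uint32_t R = ((X << C) | ((Y & 0xFF) >> (8 - C))) & 0xFF;
  assert(R == 0x5E); // (0x558 | 0x6) & 0xFF
}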
+ +--- +name: test_fshl_s24_s24 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; SI-LABEL: name: test_fshl_s24_s24 + ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; SI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; SI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; SI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; SI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; SI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; SI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; SI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; SI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; SI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; SI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; SI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; SI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; SI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; SI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; SI: $vgpr0 = COPY [[COPY15]](s32) + ; VI-LABEL: name: test_fshl_s24_s24 + ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY 
[[COPY2]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; VI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; VI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; VI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; VI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; VI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; VI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; VI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; VI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; VI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; VI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; VI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; VI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; VI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; VI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]] + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; VI: $vgpr0 = COPY [[COPY15]](s32) + ; GFX9-LABEL: name: test_fshl_s24_s24 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = 
G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C2]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[AND3]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[AND5]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[COPY15]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s24) = G_TRUNC %0 + %4:_(s24) = G_TRUNC %1 + %5:_(s24) = G_TRUNC %2 + %6:_(s24) = G_FSHL %3, %4, %5 + %7:_(s32) = G_ANYEXT %6 + $vgpr0 = COPY %7 +... 
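Since 24 is not a power of two, the shift amount cannot be reduced with a simple mask; the checks above therefore contain a full urem-by-24 expansion (the G_UITOFP / G_AMDGPU_RCP_IFLAG / G_FPTOUI / G_UMULH sequence is the unoptimized AMDGPU expansion of a 32-bit unsigned remainder), and the inverse amount is formed as 23 - (Z % 24). A reference model of the resulting s24 computation, illustrative only and not part of the patch (fshl24 is a hypothetical helper):

#include <cassert>
#include <cstdint>

static uint32_t fshl24(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t M = (1u << 24) - 1;
  uint32_t C = Z % 24; // needs a real urem: 24 is not a power of two
  // Split right shift (>> 1, then >> (23 - C)) keeps every amount below 24.
  return (((X & M) << C) | (((Y & M) >> 1) >> (23 - C))) & M;
}

int main() {
  assert(fshl24(0x123456, 0xABCDEF, 0) == 0x123456);  // reduced amount 0: X
  assert(fshl24(0x123456, 0xABCDEF, 24) == 0x123456); // 24 % 24 == 0 as well
  assert(fshl24(0x123456, 0xABCDEF, 4) == 0x23456A);  // (X<<4 | Y>>20) & M
}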
+ +--- +name: test_fshl_v3s16_v3s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; SI-LABEL: name: test_fshl_v3s16_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; SI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; SI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; SI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; SI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC3]], [[TRUNC4]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[ZEXT2]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C4]] + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY10]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]] + ; SI: 
[[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC5]], [[TRUNC6]] + ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; SI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]] + ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[ZEXT4]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C4]] + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY14]](s32) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]] + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32) + ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[TRUNC7]], [[TRUNC8]] + ; SI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL3]] + ; SI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; SI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]] + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND12]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL4]] + ; SI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; SI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C4]] + ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY19]], [[C4]] + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND14]], [[C]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND13]], [[SHL5]] + ; SI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI: $vgpr0 = COPY [[BITCAST8]](<2 x s16>) + ; SI: $vgpr1 = COPY [[BITCAST9]](<2 x s16>) + ; SI: $vgpr2 = COPY [[BITCAST10]](<2 x s16>) + ; VI-LABEL: name: test_fshl_v3s16_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; VI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; VI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; VI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; VI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C2]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C3]](s16) + ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR7]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C1]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C2]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C3]](s16) + ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR9]] + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]] + ; VI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]] + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND4]](s16) + ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C3]](s16) + ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND5]](s16) + ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[SHL2]], [[LSHR11]] + ; VI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; VI: [[BITCAST8:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR3]](s32) + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C4]] + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL4]] + ; VI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C4]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL5]] + ; VI: [[BITCAST10:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI: $vgpr0 = COPY [[BITCAST8]](<2 x s16>) + ; VI: $vgpr1 = COPY [[BITCAST9]](<2 x s16>) + ; VI: $vgpr2 = COPY [[BITCAST10]](<2 x s16>) + ; GFX9-LABEL: name: test_fshl_v3s16_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]] + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY 
[[C]](s32) + ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY16]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY16]] + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[COPY16]] + ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[COPY16]] + ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[COPY16]] + ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[COPY16]] + ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[COPY16]] + ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C1]] + ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[C]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32) + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C4]] + ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32) + ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[C]] + ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[SUB4]], [[FPTOUI1]] + ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[MUL2]] + ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH2]] + ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[AND1]], [[ADD1]] + ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH3]], [[C]] + ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[MUL3]] + ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB5]](s32), [[C]] + ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[SUB5]], [[C]] + ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SUB6]], [[SUB5]] + ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT2]](s32), [[C]] + ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SELECT2]], [[C]] + ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[SUB7]], [[SELECT2]] + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[SUB8:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC5]] + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], [[SUB8]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR6]] + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: 
[[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C1]] + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[COPY23]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG2:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP2]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG2]], [[C5]] + ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL2]](s32) + ; GFX9: [[SUB9:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY23]] + ; GFX9: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[SUB9]], [[FPTOUI2]] + ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL4]] + ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI2]], [[UMULH4]] + ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[AND2]], [[ADD2]] + ; GFX9: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[COPY23]] + ; GFX9: [[SUB10:%[0-9]+]]:_(s32) = G_SUB [[AND2]], [[MUL5]] + ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB10]](s32), [[COPY23]] + ; GFX9: [[SUB11:%[0-9]+]]:_(s32) = G_SUB [[SUB10]], [[COPY23]] + ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s1), [[SUB11]], [[SUB10]] + ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT4]](s32), [[COPY23]] + ; GFX9: [[SUB12:%[0-9]+]]:_(s32) = G_SUB [[SELECT4]], [[COPY23]] + ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SUB12]], [[SELECT4]] + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[COPY25]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG3:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP3]](s32) + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG3]], [[C6]] + ; GFX9: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL3]](s32) + ; GFX9: [[SUB13:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY25]] + ; GFX9: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[SUB13]], [[FPTOUI3]] + ; GFX9: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL6]] + ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[UMULH6]] + ; GFX9: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[COPY24]], [[ADD3]] + ; GFX9: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[UMULH7]], [[COPY25]] + ; GFX9: [[SUB14:%[0-9]+]]:_(s32) = G_SUB [[COPY24]], [[MUL7]] + ; GFX9: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB14]](s32), [[COPY25]] + ; GFX9: [[SUB15:%[0-9]+]]:_(s32) = G_SUB [[SUB14]], [[COPY25]] + ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[SUB15]], [[SUB14]] + ; GFX9: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT6]](s32), [[COPY25]] + ; GFX9: [[SUB16:%[0-9]+]]:_(s32) = G_SUB [[SELECT6]], [[COPY25]] + ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[SUB16]], [[SELECT6]] + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[SELECT5]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[COPY27]](s32) + ; GFX9: [[SUB17:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC6]], [[BUILD_VECTOR_TRUNC7]] + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[SUB17]](<2 x s16>) + ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR7]] + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x 
s16>) + ; GFX9: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>) + ; GFX9: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; GFX9: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; GFX9: [[COPY28:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY28]](s32), [[COPY29]](s32) + ; GFX9: [[COPY30:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY30]](s32), [[COPY31]](s32) + ; GFX9: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; GFX9: [[COPY33:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY32]](s32), [[COPY33]](s32) + ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC9]](<2 x s16>) + ; GFX9: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC10]](<2 x s16>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = COPY $vgpr2 + %3:_(<2 x s16>) = COPY $vgpr3 + %4:_(<2 x s16>) = COPY $vgpr4 + %5:_(<2 x s16>) = COPY $vgpr5 + %6:_(<2 x s16>) = G_IMPLICIT_DEF + %7:_(<6 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>), %6(<2 x s16>) + %8:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %7(<6 x s16>) + %10:_(<6 x s16>) = G_CONCAT_VECTORS %2(<2 x s16>), %3(<2 x s16>), %6(<2 x s16>) + %11:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %10(<6 x s16>) + %13:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %6(<2 x s16>) + %14:_(<3 x s16>), %15:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>) + %16:_(<3 x s16>) = G_FSHL %8, %11, %14(<3 x s16>) + %17:_(<3 x s16>) = G_IMPLICIT_DEF + %18:_(<6 x s16>) = G_CONCAT_VECTORS %16(<3 x s16>), %17(<3 x s16>) + %19:_(<2 x s16>), %20:_(<2 x s16>), %21:_(<2 x s16>) = G_UNMERGE_VALUES %18(<6 x s16>) + $vgpr0 = COPY %19(<2 x s16>) + $vgpr1 = COPY %20(<2 x s16>) + $vgpr2 = COPY %21(<2 x s16>) +... 
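In the fshl tests above, SI and VI expand each s16 element with the masked form — shift amount Z & 15, complement ~Z & 15, and the right-hand half pre-shifted right by one so neither shift can reach 16 — while the GFX9 v3s16 checks instead compute Z % 16 with an inline-expanded urem and shift the right half by 16 - (Z % 16). A minimal C++ model of the masked identity these checks encode; fshl16 and the test values are made up for illustration, not part of the patch:

  #include <cassert>
  #include <cstdint>

  // Reference model of the masked expansion for s16 elements:
  //   fshl16(X, Y, Z) = (X << (Z & 15)) | ((Y >> 1) >> (~Z & 15))
  // Since (Z & 15) + 1 + (~Z & 15) == 16, the two halves select adjacent
  // bit ranges of the 32-bit concatenation X:Y, and every individual shift
  // amount stays strictly below 16.
  static uint16_t fshl16(uint16_t X, uint16_t Y, uint16_t Z) {
    uint16_t ShAmt = Z & 15;
    uint16_t InvShAmt = static_cast<uint16_t>(~Z) & 15;
    return static_cast<uint16_t>(X << ShAmt) |
           static_cast<uint16_t>(static_cast<uint16_t>(Y >> 1) >> InvShAmt);
  }

  int main() {
    const uint16_t X = 0xABCD, Y = 0x1234;
    const uint32_t Concat = (static_cast<uint32_t>(X) << 16) | Y;
    for (unsigned Z = 0; Z < 64; ++Z) {
      // Generic funnel-shift definition: top 16 bits of (X:Y) << (Z % 16).
      uint16_t Expected = static_cast<uint16_t>((Concat << (Z % 16)) >> 16);
      assert(fshl16(X, Y, static_cast<uint16_t>(Z)) == Expected);
    }
    return 0;
  }

Both variants agree with the generic definition used for Expected here; in these tests the power-of-two widths get the mask, while the s24 cases further down fall back to a real urem.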
+ +--- +name: test_fshl_v4s16_v4s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; SI-LABEL: name: test_fshl_v4s16_v4s16 + ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; SI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]] + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C2]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C4]] + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C4]] + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC4]], [[TRUNC5]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C1]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C2]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32) + ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C4]] + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = 
G_ZEXT [[AND5]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C4]] + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]] + ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C1]] + ; SI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C2]] + ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[ZEXT4]](s32) + ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL2]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C4]] + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY11]](s32) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C4]] + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]] + ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C1]] + ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC3]], [[C2]] + ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY14]], [[ZEXT6]](s32) + ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY16]], [[C4]] + ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY15]](s32) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C4]] + ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL4]] + ; SI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL5]] + ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) + ; SI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; VI-LABEL: name: test_fshl_v4s16_v4s16 + ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; VI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) 
= G_TRUNC [[BITCAST]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[UV4]](<2 x s16>) + ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[UV5]](<2 x s16>) + ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C1]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC8]], [[C2]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C1]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C3]](s16) + ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR7]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C1]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC9]], [[C2]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C1]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C3]](s16) + ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR9]] + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C1]] + ; VI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC10]], [[C2]] + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C1]] + ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND4]](s16) + ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC6]], [[C3]](s16) + ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND5]](s16) + ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[SHL2]], [[LSHR11]] + ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C1]] + ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC11]], [[C2]] + ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C1]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC3]], [[AND6]](s16) + ; VI: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC7]], [[C3]](s16) + ; VI: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[LSHR12]], [[AND7]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR 
[[SHL3]], [[LSHR13]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; VI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR4]](s32) + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST6]](<2 x s16>), [[BITCAST7]](<2 x s16>) + ; VI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-LABEL: name: test_fshl_v4s16_v4s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[AND]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR]], [[AND1]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR1]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]] + ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[AND2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], 
[[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[LSHR2]], [[AND3]](<2 x s16>) + ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR3]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[OR]](<2 x s16>), [[OR1]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<4 x s16>) = COPY $vgpr4_vgpr5 + %3:_(<4 x s16>) = G_FSHL %0, %1, %2 + $vgpr0_vgpr1 = COPY %3 +... Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir @@ -90,11 +90,27 @@ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; SI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16) - ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C1]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[C2]](s32) + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY4]], [[ZEXT]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[ZEXT1]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC1]], [[TRUNC2]] + ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; SI: $vgpr0 = COPY [[ANYEXT]](s32) ; VI-LABEL: name: test_fshr_s16_s16 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -103,8 +119,17 @@ ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; VI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16) + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C2]](s16) + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16) + ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[AND]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR]] + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; VI: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-LABEL: name: 
test_fshr_s16_s16 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 @@ -113,8 +138,17 @@ ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s16) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s16) + ; GFX9: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C]] + ; GFX9: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC2]], [[C1]] + ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C]] + ; GFX9: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C2]](s16) + ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[SHL]], [[AND1]](s16) + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[AND]](s16) + ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR]] + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -137,35 +171,194 @@ ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; SI: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>) - ; SI: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>) - ; SI: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>) - ; SI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16) - ; SI: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16) - ; SI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16) - ; SI: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[TRUNC1]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; SI: 
[[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]] + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[COPY10]](s32) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[COPY12]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL4]] + ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]] + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] + ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]] + ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32) + ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[SHL2]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C5]] + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY16]](s32) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]] + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]] + ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] + ; SI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]] + ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) + ; SI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32) + ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[SHL3]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]] + ; SI: 
[[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY19]](s32) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16) + ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C5]] + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]] + ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI: $vgpr0 = COPY [[BITCAST5]](<2 x s16>) ; VI-LABEL: name: test_fshr_v2s16_v2s16 ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; VI: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>) - ; VI: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>) - ; VI: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>) - ; VI: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16) - ; VI: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16) - ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16) - ; VI: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16) + ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR3]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16) + ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR5]] + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16) + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY 
[[C4]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]] + ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BITCAST3]] + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) + ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] + ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16) + ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16) + ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL5]], [[LSHR9]] + ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] + ; VI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] + ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16) + ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16) + ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[SHL6]], [[LSHR11]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]] + ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI: $vgpr0 = COPY [[BITCAST5]](<2 x s16>) ; GFX9-LABEL: name: test_fshr_v2s16_v2s16 ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>) - ; GFX9: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>) - ; GFX9: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](<2 x s16>) - ; GFX9: [[FSHR:%[0-9]+]]:_(s16) = G_FSHR [[UV]], [[UV2]], [[UV4]](s16) - ; GFX9: [[FSHR1:%[0-9]+]]:_(s16) = G_FSHR [[UV1]], [[UV3]], [[UV5]](s16) - ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FSHR]](s16), [[FSHR1]](s16) - ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[COPY2]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[COPY2]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>) + ; 
GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[COPY1]], [[AND]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]] + ; GFX9: $vgpr0 = COPY [[OR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = COPY $vgpr2 @@ -183,20 +376,53 @@ ; SI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; SI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 ; SI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 - ; SI: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64) - ; SI: $vgpr0_vgpr1 = COPY [[FSHR]](s64) + ; SI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; SI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; SI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; SI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; SI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32) + ; SI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; SI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; SI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32) + ; SI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]] + ; SI: $vgpr0_vgpr1 = COPY [[OR]](s64) ; VI-LABEL: name: test_fshr_s64_s64 ; VI: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; VI: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 ; VI: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 - ; VI: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64) - ; VI: $vgpr0_vgpr1 = COPY [[FSHR]](s64) + ; VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; VI: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; VI: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; VI: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32) + ; VI: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; VI: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; VI: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32) + ; VI: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]] + ; VI: $vgpr0_vgpr1 = COPY [[OR]](s64) ; GFX9-LABEL: name: test_fshr_s64_s64 ; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 ; GFX9: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr4_vgpr5 - ; GFX9: [[FSHR:%[0-9]+]]:_(s64) = G_FSHR [[COPY]], [[COPY1]], [[COPY2]](s64) - ; GFX9: $vgpr0_vgpr1 = COPY [[FSHR]](s64) + ; GFX9: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63 + ; GFX9: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY2]], [[C]] + ; GFX9: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; GFX9: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY2]], [[C1]] + ; GFX9: [[AND1:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C2]](s32) + ; GFX9: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; GFX9: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SHL]], [[TRUNC]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; GFX9: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[TRUNC1]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s64) = G_OR [[SHL1]], [[LSHR]] + ; GFX9: $vgpr0_vgpr1 = COPY [[OR]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = COPY $vgpr4_vgpr5 @@ -214,32 +440,115 @@ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI: 
[[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; SI: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) - ; SI: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) - ; SI: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) - ; SI: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8) - ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8) - ; SI: $vgpr0 = COPY [[ANYEXT]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND2]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C3]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[AND3]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY13]], [[C3]] + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C3]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY15]], [[COPY16]] + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; SI: $vgpr0 = COPY [[COPY17]](s32) ; VI-LABEL: name: test_fshr_s8_s8 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; VI: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) - ; VI: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) - ; VI: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) - ; VI: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8) - ; VI: $vgpr0 = COPY [[ANYEXT]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; VI: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) 
= G_TRUNC [[AND2]](s32) + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; VI: [[COPY11:%[0-9]+]]:_(s16) = COPY [[SHL]](s16) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32) + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[COPY11]], [[TRUNC2]](s16) + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND4]](s32) + ; VI: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND5]], [[TRUNC4]](s16) + ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16) + ; VI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]] + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; VI: $vgpr0 = COPY [[COPY13]](s32) ; GFX9-LABEL: name: test_fshr_s8_s8 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) - ; GFX9: [[FSHR:%[0-9]+]]:_(s8) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s8) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s8) - ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[COPY4]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY5]], [[COPY6]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[COPY8]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[AND2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[TRUNC1]](s16) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; GFX9: [[COPY11:%[0-9]+]]:_(s16) = COPY [[SHL]](s16) + ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[AND3]](s32) + ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[COPY11]], [[TRUNC2]](s16) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C3]] + ; GFX9: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[AND4]](s32) + ; GFX9: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[AND5]], [[TRUNC4]](s16) + ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SHL1]](s16) + ; GFX9: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[ANYEXT]], [[ANYEXT1]] + ; GFX9: 
[[COPY13:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[COPY13]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -261,32 +570,158 @@ ; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; SI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; SI: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32) - ; SI: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32) - ; SI: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32) - ; SI: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24) - ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24) - ; SI: $vgpr0 = COPY [[ANYEXT]](s32) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; SI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; SI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; SI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; SI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; SI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; SI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; SI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; SI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; SI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; SI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; SI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; SI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; SI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; SI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; SI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; SI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; SI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; SI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; SI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; SI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; SI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; SI: $vgpr0 = COPY [[COPY15]](s32) ; VI-LABEL: name: 
test_fshr_s24_s24 ; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; VI: [[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32) - ; VI: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32) - ; VI: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32) - ; VI: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24) - ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24) - ; VI: $vgpr0 = COPY [[ANYEXT]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; VI: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; VI: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; VI: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; VI: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; VI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; VI: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; VI: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; VI: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; VI: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; VI: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; VI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; VI: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; VI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; VI: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; VI: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; VI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; VI: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; VI: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32) + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; VI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32) + ; VI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32) + ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; VI: $vgpr0 = COPY [[COPY15]](s32) ; GFX9-LABEL: name: test_fshr_s24_s24 ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9: 
[[TRUNC:%[0-9]+]]:_(s24) = G_TRUNC [[COPY]](s32) - ; GFX9: [[TRUNC1:%[0-9]+]]:_(s24) = G_TRUNC [[COPY1]](s32) - ; GFX9: [[TRUNC2:%[0-9]+]]:_(s24) = G_TRUNC [[COPY2]](s32) - ; GFX9: [[FSHR:%[0-9]+]]:_(s24) = G_FSHR [[TRUNC]], [[TRUNC1]], [[TRUNC2]](s24) - ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FSHR]](s24) - ; GFX9: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16777215 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C3]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C4]], [[AND1]] + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[AND1]] + ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[AND1]] + ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[COPY5]], [[COPY6]] + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY8]], [[AND2]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[SUB4]](s32) + ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C2]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[SHL]](s32) + ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY10]], [[AND3]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C2]] + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY12]], [[C2]] + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[AND4]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[SHL1]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY13]], [[COPY14]] + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR]](s32) + ; GFX9: $vgpr0 = COPY [[COPY15]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -297,3 +732,841 @@ %7:_(s32) = G_ANYEXT %6 $vgpr0 = COPY %7 ... 
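The s24 runs above take the non-power-of-two path: because 24 is not a power of two, Z & 23 is not Z % 24, so the checks reduce the amount with an inline-expanded reciprocal-based urem (G_UITOFP, G_AMDGPU_RCP_IFLAG, G_UMULH, plus two conditional-fixup G_SELECTs) and then shift with ShAmt = Z % 24 and InvShAmt = 23 - ShAmt, pre-shifting X left by one. A small C++ sanity check of that identity, again with made-up names:

  #include <cassert>
  #include <cstdint>

  // Reference model of the 24-bit G_FSHR expansion checked above:
  //   fshr24(X, Y, Z) = ((X << 1) << (23 - Z % 24)) | (Y >> (Z % 24)),
  // masked to 24 bits. The pre-shift by one keeps both variable shift
  // amounts strictly below 24 even when Z % 24 == 0.
  static uint32_t fshr24(uint32_t X, uint32_t Y, uint32_t Z) {
    uint32_t ShAmt = Z % 24; // the MIR expands this urem inline
    uint32_t InvShAmt = 23 - ShAmt;
    return (((X << 1) << InvShAmt) | (Y >> ShAmt)) & 0xFFFFFF;
  }

  int main() {
    const uint32_t X = 0xABCDEF, Y = 0x123456;                    // 24-bit payloads
    const uint64_t Concat = (static_cast<uint64_t>(X) << 24) | Y; // X:Y, 48 bits
    for (uint32_t Z = 0; Z < 72; ++Z) {
      // Generic funnel-shift definition: low 24 bits of (X:Y) >> (Z % 24).
      uint32_t Expected = static_cast<uint32_t>(Concat >> (Z % 24)) & 0xFFFFFF;
      assert(fshr24(X, Y, Z) == Expected);
    }
    return 0;
  }

The two ICMP/SUB/SELECT pairs in the generated sequence conditionally subtract the divisor to correct the approximate reciprocal quotient; the model above simply uses the % operator.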
+ +--- +name: test_fshr_v3s16_v3s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; SI-LABEL: name: test_fshr_v3s16_v3s16 + ; SI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; SI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; SI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; SI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; SI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; SI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; SI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; SI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; SI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; SI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; SI: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY6]], [[COPY7]] + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[COPY8]] + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] + ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[TRUNC]], [[C3]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY9]], [[ZEXT]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C2]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[TRUNC1]], [[C3]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY12]], [[ZEXT2]](s32) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY14]], [[C5]] 
+ ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY13]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C5]] + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC4]], [[TRUNC5]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL2]] + ; SI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; SI: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY16]], [[COPY17]] + ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SUB2]](s32) + ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; SI: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[COPY18]], [[DEF1]] + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[SUB3]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] + ; SI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] + ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C2]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[ZEXT6]](s32) + ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL3]](s32) + ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C5]] + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY20]](s32) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C5]] + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT7]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]] + ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] + ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] + ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) + ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY23]], [[ZEXT8]](s32) + ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL4]](s32) + ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[COPY25]], [[COPY24]](s32) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16) + ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY26]], [[C5]] + ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[ZEXT9]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT11]], [[C]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT10]], [[SHL5]] + ; SI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; SI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = 
G_IMPLICIT_DEF + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST6]](<2 x s16>) + ; SI: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; SI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST7]](<2 x s16>) + ; SI: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; SI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; SI: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY27]], [[C5]] + ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32) + ; SI: [[AND16:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C5]] + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND16]], [[C]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND15]], [[SHL6]] + ; SI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; SI: [[AND17:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C5]] + ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; SI: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY30]], [[C5]] + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND18]], [[C]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND17]], [[SHL7]] + ; SI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32) + ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY31]], [[C5]] + ; SI: [[COPY32:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; SI: [[AND20:%[0-9]+]]:_(s32) = G_AND [[COPY32]], [[C5]] + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND20]], [[C]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND19]], [[SHL8]] + ; SI: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; SI: $vgpr0 = COPY [[BITCAST12]](<2 x s16>) + ; SI: $vgpr1 = COPY [[BITCAST13]](<2 x s16>) + ; SI: $vgpr2 = COPY [[BITCAST14]](<2 x s16>) + ; VI-LABEL: name: test_fshr_v3s16_v3s16 + ; VI: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; VI: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; VI: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; VI: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; VI: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; VI: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; VI: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; VI: [[DEF1:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; VI: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; VI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST 
[[COPY4]](<2 x s16>) + ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR4]](s32) + ; VI: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST5]](s32) + ; VI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; VI: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 0 + ; VI: [[SUB:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[TRUNC6]] + ; VI: [[SUB1:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[TRUNC7]] + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[SUB]], [[C2]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[SUB]], [[C3]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; VI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR6:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C4]](s16) + ; VI: [[LSHR7:%[0-9]+]]:_(s16) = G_LSHR [[LSHR6]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], [[LSHR7]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[SUB1]], [[C2]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[SUB1]], [[C3]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C4]](s16) + ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR9]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; VI: [[BITCAST6:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[SUB2:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[TRUNC8]] + ; VI: [[SUB3:%[0-9]+]]:_(s16) = G_SUB [[C1]], [[DEF1]] + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[SUB2]], [[C2]] + ; VI: [[XOR2:%[0-9]+]]:_(s16) = G_XOR [[SUB2]], [[C3]] + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR2]], [[C2]] + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC2]], [[AND4]](s16) + ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC5]], [[C4]](s16) + ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND5]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL3]], [[LSHR11]] + ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[SUB3]], [[C2]] + ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[SUB3]], [[C3]] + ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[DEF1]], [[AND6]](s16) + ; VI: [[LSHR12:%[0-9]+]]:_(s16) = G_LSHR [[DEF1]], [[C4]](s16) + ; VI: [[LSHR13:%[0-9]+]]:_(s16) = G_LSHR [[LSHR12]], [[AND7]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[SHL4]], [[LSHR13]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]] + ; VI: [[BITCAST7:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST6]](<2 x s16>) + ; VI: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; VI: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[BITCAST7]](<2 x s16>) + ; VI: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; VI: [[UV:%[0-9]+]]:_(<2 x 
s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; VI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C]](s32) + ; VI: [[BITCAST11:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST11]], [[C]](s32) + ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C5]] + ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32) + ; VI: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C5]] + ; VI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C]](s32) + ; VI: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND8]], [[SHL6]] + ; VI: [[BITCAST12:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR6]](s32) + ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; VI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[BITCAST10]](s32) + ; VI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C]](s32) + ; VI: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]] + ; VI: [[BITCAST13:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR7]](s32) + ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32) + ; VI: [[AND12:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C5]] + ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST11]](s32) + ; VI: [[AND13:%[0-9]+]]:_(s32) = G_AND [[COPY11]], [[C5]] + ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND13]], [[C]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND12]], [[SHL8]] + ; VI: [[BITCAST14:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; VI: $vgpr0 = COPY [[BITCAST12]](<2 x s16>) + ; VI: $vgpr1 = COPY [[BITCAST13]](<2 x s16>) + ; VI: $vgpr2 = COPY [[BITCAST14]](<2 x s16>) + ; GFX9-LABEL: name: test_fshr_v3s16_v3s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX9: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[DEF1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[COPY2]](<2 x s16>) + ; GFX9: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX9: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[COPY3]](<2 x s16>) + ; GFX9: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), 
[[COPY11]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[BITCAST3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[DEF1]](s32) + ; GFX9: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[COPY4]](<2 x s16>) + ; GFX9: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C]](s32) + ; GFX9: [[BITCAST5:%[0-9]+]]:_(s32) = G_BITCAST [[COPY5]](<2 x s16>) + ; GFX9: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST5]], [[C]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY13]](s32), [[COPY14]](s32) + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[BITCAST4]](s32) + ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY15]], [[C1]] + ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY16]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C2]] + ; GFX9: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX9: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY16]] + ; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX9: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[COPY16]] + ; GFX9: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; GFX9: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[COPY16]] + ; GFX9: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[COPY16]] + ; GFX9: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX9: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT]](s32), [[COPY16]] + ; GFX9: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT]], [[COPY16]] + ; GFX9: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT]] + ; GFX9: [[COPY17:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C1]] + ; GFX9: [[UITOFP1:%[0-9]+]]:_(s32) = G_UITOFP [[C]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG1:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP1]](s32) + ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG1]], [[C4]] + ; GFX9: [[FPTOUI1:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL1]](s32) + ; GFX9: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[C]] + ; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[SUB4]], [[FPTOUI1]] + ; GFX9: [[UMULH2:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI1]], [[MUL2]] + ; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[UMULH2]] + ; GFX9: [[UMULH3:%[0-9]+]]:_(s32) = G_UMULH [[AND1]], [[ADD1]] + ; GFX9: [[MUL3:%[0-9]+]]:_(s32) = G_MUL [[UMULH3]], [[C]] + ; GFX9: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[AND1]], [[MUL3]] + ; GFX9: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB5]](s32), [[C]] + ; GFX9: [[SUB6:%[0-9]+]]:_(s32) = G_SUB [[SUB5]], [[C]] + ; GFX9: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s1), [[SUB6]], [[SUB5]] + ; GFX9: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT2]](s32), [[C]] + ; GFX9: [[SUB7:%[0-9]+]]:_(s32) = G_SUB [[SELECT2]], [[C]] + ; GFX9: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), 
[[SUB7]], [[SELECT2]] + ; GFX9: [[COPY18:%[0-9]+]]:_(s32) = COPY [[SELECT1]](s32) + ; GFX9: [[COPY19:%[0-9]+]]:_(s32) = COPY [[SELECT3]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY18]](s32), [[COPY19]](s32) + ; GFX9: [[SUB8:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC4]], [[BUILD_VECTOR_TRUNC5]] + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC]], [[SUB8]](<2 x s16>) + ; GFX9: [[LSHR6:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC2]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL]], [[LSHR6]] + ; GFX9: [[COPY20:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY21:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC6:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY20]](s32), [[COPY21]](s32) + ; GFX9: [[COPY22:%[0-9]+]]:_(s32) = COPY [[BITCAST5]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY22]], [[C1]] + ; GFX9: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[UITOFP2:%[0-9]+]]:_(s32) = G_UITOFP [[COPY23]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG2:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP2]](s32) + ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG2]], [[C5]] + ; GFX9: [[FPTOUI2:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL2]](s32) + ; GFX9: [[SUB9:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY23]] + ; GFX9: [[MUL4:%[0-9]+]]:_(s32) = G_MUL [[SUB9]], [[FPTOUI2]] + ; GFX9: [[UMULH4:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL4]] + ; GFX9: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI2]], [[UMULH4]] + ; GFX9: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[AND2]], [[ADD2]] + ; GFX9: [[MUL5:%[0-9]+]]:_(s32) = G_MUL [[UMULH5]], [[COPY23]] + ; GFX9: [[SUB10:%[0-9]+]]:_(s32) = G_SUB [[AND2]], [[MUL5]] + ; GFX9: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB10]](s32), [[COPY23]] + ; GFX9: [[SUB11:%[0-9]+]]:_(s32) = G_SUB [[SUB10]], [[COPY23]] + ; GFX9: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP4]](s1), [[SUB11]], [[SUB10]] + ; GFX9: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT4]](s32), [[COPY23]] + ; GFX9: [[SUB12:%[0-9]+]]:_(s32) = G_SUB [[SELECT4]], [[COPY23]] + ; GFX9: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SUB12]], [[SELECT4]] + ; GFX9: [[COPY24:%[0-9]+]]:_(s32) = COPY [[C3]](s32) + ; GFX9: [[COPY25:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[UITOFP3:%[0-9]+]]:_(s32) = G_UITOFP [[COPY25]](s32) + ; GFX9: [[AMDGPU_RCP_IFLAG3:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP3]](s32) + ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX9: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG3]], [[C6]] + ; GFX9: [[FPTOUI3:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL3]](s32) + ; GFX9: [[SUB13:%[0-9]+]]:_(s32) = G_SUB [[C3]], [[COPY25]] + ; GFX9: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[SUB13]], [[FPTOUI3]] + ; GFX9: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL6]] + ; GFX9: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[UMULH6]] + ; GFX9: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[COPY24]], [[ADD3]] + ; GFX9: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[UMULH7]], [[COPY25]] + ; GFX9: [[SUB14:%[0-9]+]]:_(s32) = G_SUB [[COPY24]], [[MUL7]] + ; GFX9: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB14]](s32), [[COPY25]] + ; GFX9: [[SUB15:%[0-9]+]]:_(s32) = G_SUB [[SUB14]], [[COPY25]] + ; GFX9: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[SUB15]], [[SUB14]] + ; GFX9: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT6]](s32), [[COPY25]] + ; GFX9: 
[[SUB16:%[0-9]+]]:_(s32) = G_SUB [[SELECT6]], [[COPY25]] + ; GFX9: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[SUB16]], [[SELECT6]] + ; GFX9: [[COPY26:%[0-9]+]]:_(s32) = COPY [[SELECT5]](s32) + ; GFX9: [[COPY27:%[0-9]+]]:_(s32) = COPY [[SELECT7]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY26]](s32), [[COPY27]](s32) + ; GFX9: [[SUB17:%[0-9]+]]:_(<2 x s16>) = G_SUB [[BUILD_VECTOR_TRUNC6]], [[BUILD_VECTOR_TRUNC7]] + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[BUILD_VECTOR_TRUNC1]], [[SUB17]](<2 x s16>) + ; GFX9: [[LSHR7:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[BUILD_VECTOR_TRUNC3]], [[BUILD_VECTOR_TRUNC7]](<2 x s16>) + ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR7]] + ; GFX9: [[DEF2:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[DEF3:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[OR]](<2 x s16>) + ; GFX9: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C]](s32) + ; GFX9: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[OR1]](<2 x s16>) + ; GFX9: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C]](s32) + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF2]](<4 x s16>) + ; GFX9: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C]](s32) + ; GFX9: [[BITCAST9:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST9]], [[C]](s32) + ; GFX9: [[COPY28:%[0-9]+]]:_(s32) = COPY [[BITCAST6]](s32) + ; GFX9: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC8:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY28]](s32), [[COPY29]](s32) + ; GFX9: [[COPY30:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; GFX9: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC9:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY30]](s32), [[COPY31]](s32) + ; GFX9: [[COPY32:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; GFX9: [[COPY33:%[0-9]+]]:_(s32) = COPY [[BITCAST9]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC10:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY32]](s32), [[COPY33]](s32) + ; GFX9: $vgpr0 = COPY [[BUILD_VECTOR_TRUNC8]](<2 x s16>) + ; GFX9: $vgpr1 = COPY [[BUILD_VECTOR_TRUNC9]](<2 x s16>) + ; GFX9: $vgpr2 = COPY [[BUILD_VECTOR_TRUNC10]](<2 x s16>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr1 + %2:_(<2 x s16>) = COPY $vgpr2 + %3:_(<2 x s16>) = COPY $vgpr3 + %4:_(<2 x s16>) = COPY $vgpr4 + %5:_(<2 x s16>) = COPY $vgpr5 + %6:_(<2 x s16>) = G_IMPLICIT_DEF + %7:_(<6 x s16>) = G_CONCAT_VECTORS %0(<2 x s16>), %1(<2 x s16>), %6(<2 x s16>) + %8:_(<3 x s16>), %9:_(<3 x s16>) = G_UNMERGE_VALUES %7(<6 x s16>) + %10:_(<6 x s16>) = G_CONCAT_VECTORS %2(<2 x s16>), %3(<2 x s16>), %6(<2 x s16>) + %11:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %10(<6 x s16>) + %13:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %6(<2 x s16>) + %14:_(<3 x s16>), %15:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>) + %16:_(<3 x s16>) = G_FSHR %8, %11, %14(<3 x s16>) + %17:_(<3 x s16>) = G_IMPLICIT_DEF + %18:_(<6 x s16>) = G_CONCAT_VECTORS %16(<3 x s16>), %17(<3 x s16>) + %19:_(<2 x s16>), %20:_(<2 x s16>), %21:_(<2 x s16>) = G_UNMERGE_VALUES %18(<6 x s16>) + $vgpr0 = COPY %19(<2 x s16>) + $vgpr1 = COPY %20(<2 x s16>) + $vgpr2 = COPY %21(<2 x s16>) +... 
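+# <3 x s16> is not a legal type here, so test_fshr_v3s16_v3s16 pads each
+# operand with an undef element and works on <2 x s16> halves; the
+# G_IMPLICIT_DEF values threaded through the second half of the expansion
+# come from that padding. SI and VI scalarize to s16 (emulated in s32 on
+# SI), while GFX9 computes the per-lane Z % 16 amounts and then performs
+# the shifts as <2 x s16> vector operations.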
+ +--- +name: test_fshr_v4s16_v4s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4_vgpr5 + + ; SI-LABEL: name: test_fshr_v4s16_v4s16 + ; SI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; SI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; SI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; SI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; SI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; SI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; SI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; SI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; SI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; SI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; SI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; SI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; SI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; SI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; SI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[AND]](s16) + ; SI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[BITCAST]](s32) + ; SI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY3]], [[ZEXT]](s32) + ; SI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; SI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; SI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[BITCAST1]](s32) + ; SI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C5]] + ; SI: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C4]](s32) + ; SI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[AND1]](s16) + ; SI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; SI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C5]] + ; SI: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[ZEXT1]](s32) + ; SI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; SI: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[TRUNC1]] + ; SI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; SI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[AND4]](s16) + ; SI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; SI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY6]], [[ZEXT2]](s32) + ; SI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; SI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; SI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C5]] + ; SI: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[COPY7]](s32) + ; SI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[AND5]](s16) + ; SI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; SI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C5]] + ; SI: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND7]], [[ZEXT3]](s32) + ; SI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR5]](s32) + ; SI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[TRUNC3]] + ; SI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; SI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; SI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[BITCAST2]](s32) + ; SI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[COPY11]], [[COPY10]](s32) + ; SI: [[COPY12:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; 
SI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[COPY13]], [[COPY12]](s32) + ; SI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[COPY15]], [[C1]](s32) + ; SI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY14]], [[SHL4]] + ; SI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; SI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]] + ; SI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) + ; SI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; SI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; SI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; SI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C2]] + ; SI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC4]], [[C3]] + ; SI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; SI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[AND8]](s16) + ; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[OR]](s16) + ; SI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT]], [[ZEXT4]](s32) + ; SI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[SHL5]](s32) + ; SI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY17:%[0-9]+]]:_(s32) = COPY [[SHL2]](s32) + ; SI: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY17]], [[C5]] + ; SI: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[AND10]], [[COPY16]](s32) + ; SI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[AND9]](s16) + ; SI: [[COPY18:%[0-9]+]]:_(s32) = COPY [[LSHR8]](s32) + ; SI: [[AND11:%[0-9]+]]:_(s32) = G_AND [[COPY18]], [[C5]] + ; SI: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[AND11]], [[ZEXT5]](s32) + ; SI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; SI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[TRUNC6]], [[TRUNC7]] + ; SI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C2]] + ; SI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC5]], [[C3]] + ; SI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] + ; SI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[AND12]](s16) + ; SI: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[OR1]](s16) + ; SI: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT1]], [[ZEXT6]](s32) + ; SI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[SHL6]](s32) + ; SI: [[COPY19:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY20:%[0-9]+]]:_(s32) = COPY [[SHL3]](s32) + ; SI: [[AND14:%[0-9]+]]:_(s32) = G_AND [[COPY20]], [[C5]] + ; SI: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[AND14]], [[COPY19]](s32) + ; SI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[AND13]](s16) + ; SI: [[COPY21:%[0-9]+]]:_(s32) = COPY [[LSHR10]](s32) + ; SI: [[AND15:%[0-9]+]]:_(s32) = G_AND [[COPY21]], [[C5]] + ; SI: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[AND15]], [[ZEXT7]](s32) + ; SI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; SI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[TRUNC8]], [[TRUNC9]] + ; SI: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; SI: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; SI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT9]], [[C1]](s32) + ; SI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT8]], [[SHL7]] + ; SI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; SI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; SI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) + ; SI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) + ; SI: [[AND16:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND17:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] + ; SI: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[AND16]](s16) + ; SI: [[COPY22:%[0-9]+]]:_(s32) = COPY 
[[BITCAST6]](s32) + ; SI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[COPY22]], [[ZEXT10]](s32) + ; SI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[SHL8]](s32) + ; SI: [[COPY23:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY24:%[0-9]+]]:_(s32) = COPY [[BITCAST7]](s32) + ; SI: [[AND18:%[0-9]+]]:_(s32) = G_AND [[COPY24]], [[C5]] + ; SI: [[LSHR14:%[0-9]+]]:_(s32) = G_LSHR [[AND18]], [[COPY23]](s32) + ; SI: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[AND17]](s16) + ; SI: [[COPY25:%[0-9]+]]:_(s32) = COPY [[LSHR14]](s32) + ; SI: [[AND19:%[0-9]+]]:_(s32) = G_AND [[COPY25]], [[C5]] + ; SI: [[LSHR15:%[0-9]+]]:_(s32) = G_LSHR [[AND19]], [[ZEXT11]](s32) + ; SI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR15]](s32) + ; SI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[TRUNC10]], [[TRUNC11]] + ; SI: [[AND20:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; SI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; SI: [[AND21:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]] + ; SI: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[AND20]](s16) + ; SI: [[COPY26:%[0-9]+]]:_(s32) = COPY [[LSHR12]](s32) + ; SI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[COPY26]], [[ZEXT12]](s32) + ; SI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[SHL9]](s32) + ; SI: [[COPY27:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY28:%[0-9]+]]:_(s32) = COPY [[LSHR13]](s32) + ; SI: [[AND22:%[0-9]+]]:_(s32) = G_AND [[COPY28]], [[C5]] + ; SI: [[LSHR16:%[0-9]+]]:_(s32) = G_LSHR [[AND22]], [[COPY27]](s32) + ; SI: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[AND21]](s16) + ; SI: [[COPY29:%[0-9]+]]:_(s32) = COPY [[LSHR16]](s32) + ; SI: [[AND23:%[0-9]+]]:_(s32) = G_AND [[COPY29]], [[C5]] + ; SI: [[LSHR17:%[0-9]+]]:_(s32) = G_LSHR [[AND23]], [[ZEXT13]](s32) + ; SI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR17]](s32) + ; SI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[TRUNC12]], [[TRUNC13]] + ; SI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; SI: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) + ; SI: [[COPY30:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY31:%[0-9]+]]:_(s32) = COPY [[BITCAST8]](s32) + ; SI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[COPY31]], [[COPY30]](s32) + ; SI: [[COPY32:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY33:%[0-9]+]]:_(s32) = COPY [[LSHR18]](s32) + ; SI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[COPY33]], [[COPY32]](s32) + ; SI: [[COPY34:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[COPY35:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; SI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY35]], [[C1]](s32) + ; SI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY34]], [[SHL12]] + ; SI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; SI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]] + ; SI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>) + ; SI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32) + ; SI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32) + ; SI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32) + ; SI: [[AND24:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]] + ; SI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]] + ; SI: [[AND25:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]] + ; SI: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[AND24]](s16) + ; SI: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[OR6]](s16) + ; SI: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT2]], [[ZEXT14]](s32) + ; SI: [[TRUNC16:%[0-9]+]]:_(s16) = G_TRUNC [[SHL13]](s32) + ; SI: [[COPY36:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY37:%[0-9]+]]:_(s32) = COPY [[SHL10]](s32) + ; SI: [[AND26:%[0-9]+]]:_(s32) = G_AND [[COPY37]], [[C5]] + ; SI: [[LSHR20:%[0-9]+]]:_(s32) = G_LSHR [[AND26]], 
[[COPY36]](s32) + ; SI: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[AND25]](s16) + ; SI: [[COPY38:%[0-9]+]]:_(s32) = COPY [[LSHR20]](s32) + ; SI: [[AND27:%[0-9]+]]:_(s32) = G_AND [[COPY38]], [[C5]] + ; SI: [[LSHR21:%[0-9]+]]:_(s32) = G_LSHR [[AND27]], [[ZEXT15]](s32) + ; SI: [[TRUNC17:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR21]](s32) + ; SI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[TRUNC16]], [[TRUNC17]] + ; SI: [[AND28:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]] + ; SI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]] + ; SI: [[AND29:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]] + ; SI: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[AND28]](s16) + ; SI: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[OR7]](s16) + ; SI: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ANYEXT3]], [[ZEXT16]](s32) + ; SI: [[TRUNC18:%[0-9]+]]:_(s16) = G_TRUNC [[SHL14]](s32) + ; SI: [[COPY39:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI: [[COPY40:%[0-9]+]]:_(s32) = COPY [[SHL11]](s32) + ; SI: [[AND30:%[0-9]+]]:_(s32) = G_AND [[COPY40]], [[C5]] + ; SI: [[LSHR22:%[0-9]+]]:_(s32) = G_LSHR [[AND30]], [[COPY39]](s32) + ; SI: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[AND29]](s16) + ; SI: [[COPY41:%[0-9]+]]:_(s32) = COPY [[LSHR22]](s32) + ; SI: [[AND31:%[0-9]+]]:_(s32) = G_AND [[COPY41]], [[C5]] + ; SI: [[LSHR23:%[0-9]+]]:_(s32) = G_LSHR [[AND31]], [[ZEXT17]](s32) + ; SI: [[TRUNC19:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR23]](s32) + ; SI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[TRUNC18]], [[TRUNC19]] + ; SI: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; SI: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; SI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT19]], [[C1]](s32) + ; SI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT18]], [[SHL15]] + ; SI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32) + ; SI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; SI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; VI-LABEL: name: test_fshr_v4s16_v4s16 + ; VI: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; VI: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; VI: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; VI: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; VI: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; VI: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; VI: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 1 + ; VI: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; VI: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; VI: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; VI: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; VI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 15 + ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 -1 + ; VI: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[XOR]], [[C2]] + ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[AND]](s16) + ; VI: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C]](s16) + ; VI: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[LSHR2]], [[AND1]](s16) + ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[SHL]], 
[[LSHR3]] + ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[XOR1:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[XOR1]], [[C2]] + ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[TRUNC1]], [[AND2]](s16) + ; VI: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C]](s16) + ; VI: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[LSHR4]], [[AND3]](s16) + ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[SHL1]], [[LSHR5]] + ; VI: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; VI: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C1]](s32) + ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR6]](s32) + ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[TRUNC4]], [[C]](s16) + ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[TRUNC5]], [[C]](s16) + ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[C4]], [[C1]](s32) + ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[COPY3]], [[SHL4]] + ; VI: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) + ; VI: [[XOR2:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BITCAST3]] + ; VI: [[BITCAST4:%[0-9]+]]:_(s32) = G_BITCAST [[XOR2]](<2 x s16>) + ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST4]](s32) + ; VI: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST4]], [[C1]](s32) + ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C2]] + ; VI: [[XOR3:%[0-9]+]]:_(s16) = G_XOR [[TRUNC6]], [[C3]] + ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[XOR3]], [[C2]] + ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[OR]], [[AND4]](s16) + ; VI: [[LSHR8:%[0-9]+]]:_(s16) = G_LSHR [[SHL2]], [[C]](s16) + ; VI: [[LSHR9:%[0-9]+]]:_(s16) = G_LSHR [[LSHR8]], [[AND5]](s16) + ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[SHL5]], [[LSHR9]] + ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C2]] + ; VI: [[XOR4:%[0-9]+]]:_(s16) = G_XOR [[TRUNC7]], [[C3]] + ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[XOR4]], [[C2]] + ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[OR1]], [[AND6]](s16) + ; VI: [[LSHR10:%[0-9]+]]:_(s16) = G_LSHR [[SHL3]], [[C]](s16) + ; VI: [[LSHR11:%[0-9]+]]:_(s16) = G_LSHR [[LSHR10]], [[AND7]](s16) + ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[SHL6]], [[LSHR11]] + ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16) + ; VI: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL7]] + ; VI: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR5]](s32) + ; VI: [[BITCAST6:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST6]](s32) + ; VI: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST6]], [[C1]](s32) + ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) + ; VI: [[BITCAST7:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST7]](s32) + ; VI: [[LSHR13:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST7]], [[C1]](s32) + ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR13]](s32) + ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[XOR5:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[XOR5]], [[C2]] + ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[TRUNC8]], [[AND8]](s16) + ; VI: [[LSHR14:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC10]], [[C]](s16) + ; VI: [[LSHR15:%[0-9]+]]:_(s16) = G_LSHR [[LSHR14]], [[AND9]](s16) + ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[SHL8]], [[LSHR15]] + ; VI: 
[[AND10:%[0-9]+]]:_(s16) = G_AND [[C]], [[C2]] + ; VI: [[XOR6:%[0-9]+]]:_(s16) = G_XOR [[C]], [[C3]] + ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[XOR6]], [[C2]] + ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[TRUNC9]], [[AND10]](s16) + ; VI: [[LSHR16:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC11]], [[C]](s16) + ; VI: [[LSHR17:%[0-9]+]]:_(s16) = G_LSHR [[LSHR16]], [[AND11]](s16) + ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[SHL9]], [[LSHR17]] + ; VI: [[BITCAST8:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST8]](s32) + ; VI: [[LSHR18:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST8]], [[C1]](s32) + ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR18]](s32) + ; VI: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[TRUNC12]], [[C]](s16) + ; VI: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[TRUNC13]], [[C]](s16) + ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; VI: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[COPY5]], [[C1]](s32) + ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[COPY4]], [[SHL12]] + ; VI: [[BITCAST9:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR8]](s32) + ; VI: [[XOR7:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BITCAST9]] + ; VI: [[BITCAST10:%[0-9]+]]:_(s32) = G_BITCAST [[XOR7]](<2 x s16>) + ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST10]](s32) + ; VI: [[LSHR19:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST10]], [[C1]](s32) + ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR19]](s32) + ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C2]] + ; VI: [[XOR8:%[0-9]+]]:_(s16) = G_XOR [[TRUNC14]], [[C3]] + ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[XOR8]], [[C2]] + ; VI: [[SHL13:%[0-9]+]]:_(s16) = G_SHL [[OR6]], [[AND12]](s16) + ; VI: [[LSHR20:%[0-9]+]]:_(s16) = G_LSHR [[SHL10]], [[C]](s16) + ; VI: [[LSHR21:%[0-9]+]]:_(s16) = G_LSHR [[LSHR20]], [[AND13]](s16) + ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[SHL13]], [[LSHR21]] + ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C2]] + ; VI: [[XOR9:%[0-9]+]]:_(s16) = G_XOR [[TRUNC15]], [[C3]] + ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[XOR9]], [[C2]] + ; VI: [[SHL14:%[0-9]+]]:_(s16) = G_SHL [[OR7]], [[AND14]](s16) + ; VI: [[LSHR22:%[0-9]+]]:_(s16) = G_LSHR [[SHL11]], [[C]](s16) + ; VI: [[LSHR23:%[0-9]+]]:_(s16) = G_LSHR [[LSHR22]], [[AND15]](s16) + ; VI: [[OR10:%[0-9]+]]:_(s16) = G_OR [[SHL14]], [[LSHR23]] + ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; VI: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C1]](s32) + ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL15]] + ; VI: [[BITCAST11:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR11]](s32) + ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST5]](<2 x s16>), [[BITCAST11]](<2 x s16>) + ; VI: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9-LABEL: name: test_fshr_v4s16_v4s16 + ; GFX9: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX9: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr4_vgpr5 + ; GFX9: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX9: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX9: [[UV4:%[0-9]+]]:_(<2 x s16>), [[UV5:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY2]](<4 x s16>) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY3]](s32), [[C]](s32) 
+ ; GFX9: [[AND:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV4]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY4]](s32), [[C1]](s32) + ; GFX9: [[XOR:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV4]], [[BUILD_VECTOR_TRUNC1]] + ; GFX9: [[AND1:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR]], [[BUILD_VECTOR_TRUNC]] + ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY5]](s32), [[C2]](s32) + ; GFX9: [[SHL:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV]], [[BUILD_VECTOR_TRUNC2]](<2 x s16>) + ; GFX9: [[SHL1:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL]], [[AND1]](<2 x s16>) + ; GFX9: [[LSHR:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV2]], [[AND]](<2 x s16>) + ; GFX9: [[OR:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL1]], [[LSHR]] + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[C]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[AND2:%[0-9]+]]:_(<2 x s16>) = G_AND [[UV5]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: [[XOR1:%[0-9]+]]:_(<2 x s16>) = G_XOR [[UV5]], [[BUILD_VECTOR_TRUNC4]] + ; GFX9: [[AND3:%[0-9]+]]:_(<2 x s16>) = G_AND [[XOR1]], [[BUILD_VECTOR_TRUNC3]] + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[COPY11:%[0-9]+]]:_(s32) = COPY [[C2]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC5:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY10]](s32), [[COPY11]](s32) + ; GFX9: [[SHL2:%[0-9]+]]:_(<2 x s16>) = G_SHL [[UV1]], [[BUILD_VECTOR_TRUNC5]](<2 x s16>) + ; GFX9: [[SHL3:%[0-9]+]]:_(<2 x s16>) = G_SHL [[SHL2]], [[AND3]](<2 x s16>) + ; GFX9: [[LSHR1:%[0-9]+]]:_(<2 x s16>) = G_LSHR [[UV3]], [[AND2]](<2 x s16>) + ; GFX9: [[OR1:%[0-9]+]]:_(<2 x s16>) = G_OR [[SHL3]], [[LSHR1]] + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[OR]](<2 x s16>), [[OR1]](<2 x s16>) + ; GFX9: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<4 x s16>) = COPY $vgpr4_vgpr5 + %3:_(<4 x s16>) = G_FSHR %0, %1, %2 + $vgpr0_vgpr1 = COPY %3 +...
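+# In test_fshr_v4s16_v4s16 the bit width of 16 is a power of two, so the
+# variable amount is reduced with masks instead of G_UREM: Z % 16 folds to
+# Z & 15 and the inverted amount to ~Z & 15. GFX9 lowers directly to vector
+# shifts on the <2 x s16> halves,
+#   fshr = ((X << 1) << (~Z & 15)) | (Y >> (Z & 15))
+# while SI and VI first rewrite through lowerFunnelShiftWithInverse,
+#   fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
+# and expand the inner constant-amount fshl as plain shifts (natively on
+# s16 for VI, emulated in s32 with 0xffff masking on SI).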