diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -583,6 +583,9 @@ /// Match: shr (shl x, n), k -> sbfx/ubfx x, pos, width bool matchBitfieldExtractFromShr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Match: shr (and x, n), k -> ubfx x, pos, width + bool matchBitfieldExtractFromShrAnd(MachineInstr &MI, BuildFnTy &MatchInfo); + // Helpers for reassociation: bool matchReassocConstantInnerRHS(GPtrAdd &MI, MachineInstr *RHS, BuildFnTy &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -691,9 +691,16 @@ [{ return Helper.matchBitfieldExtractFromShr(*${root}, ${info}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; +def bitfield_extract_from_shr_and : GICombineRule< + (defs root:$root, build_fn_matchinfo:$info), + (match (wip_match_opcode G_ASHR, G_LSHR):$root, + [{ return Helper.matchBitfieldExtractFromShrAnd(*${root}, ${info}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>; + def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg, bitfield_extract_from_and, - bitfield_extract_from_shr]>; + bitfield_extract_from_shr, + bitfield_extract_from_shr_and]>; def udiv_by_const : GICombineRule< (defs root:$root), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4124,6 +4124,55 @@ return true; } +bool CombinerHelper::matchBitfieldExtractFromShrAnd( + MachineInstr &MI, BuildFnTy &MatchInfo) { + const unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_LSHR || Opcode == 
TargetOpcode::G_ASHR); + + const Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal( + TargetOpcode::G_UBFX, Ty, Ty)) + return false; + + // Try to match shr (and x, c1), c2 + Register AndSrc; + int64_t ShrAmt; + int64_t SMask; + if (!mi_match(Dst, MRI, + m_BinOp(Opcode, + m_OneNonDBGUse(m_GAnd(m_Reg(AndSrc), m_ICst(SMask))), + m_ICst(ShrAmt)))) + return false; + + const unsigned Size = Ty.getScalarSizeInBits(); + if (ShrAmt < 0 || ShrAmt >= Size) + return false; + + // Check that ubfx can do the extraction, with no holes in the mask. + uint64_t UMask = SMask; + UMask |= maskTrailingOnes<uint64_t>(ShrAmt); + UMask &= maskTrailingOnes<uint64_t>(Size); + if (!isMask_64(UMask)) + return false; + + // Calculate start position and width of the extract. + const int64_t Pos = ShrAmt; + const int64_t Width = countTrailingOnes(UMask) - ShrAmt; + + // It's preferable to keep the shift, rather than form G_SBFX. + // TODO: remove the G_AND via demanded bits analysis. 
+ if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto WidthCst = B.buildConstant(Ty, Width); + auto PosCst = B.buildConstant(Ty, Pos); + B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst}); + }; + return true; +} + bool CombinerHelper::reassociationCanBreakAddressingModePattern( MachineInstr &PtrAdd) { assert(PtrAdd.getOpcode() == TargetOpcode::G_PTR_ADD); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/form-bitfield-extract-from-shr-and.mir @@ -0,0 +1,190 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +# Check that we can fold a G_ASHR/G_LSHR fed by a G_AND into a G_SBFX/G_UBFX. + +--- +name: mask_extract_unsigned_32 +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: mask_extract_unsigned_32 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C]] + ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 12 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_LSHR %3, %2 + $w0 = COPY %4(s32) +... 
+--- +name: mask_extract_unsigned_64 +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: mask_extract_unsigned_64 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[COPY]], [[C]](s64), [[C1]] + ; CHECK-NEXT: $x0 = COPY [[UBFX]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 1080863910568919040 + %2:_(s64) = G_CONSTANT i64 56 + %3:_(s64) = G_AND %0, %1 + %4:_(s64) = G_LSHR %3, %2 + $x0 = COPY %4(s64) +... +--- +name: no_mask_extract_unsigned_128 +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_unsigned_128 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 1080863910568919040 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s128) = G_CONSTANT i128 56 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s128) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s128) = G_LSHR [[AND]], [[C1]](s128) + ; CHECK-NEXT: $q0 = COPY [[LSHR]](s128) + %0:_(s128) = COPY $q0 + %1:_(s128) = G_CONSTANT i128 1080863910568919040 + %2:_(s128) = G_CONSTANT i128 56 + %3:_(s128) = G_AND %0, %1 + %4:_(s128) = G_LSHR %3, %2 + $q0 = COPY %4(s128) +... +--- +name: mask_extract_asr +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: mask_extract_asr + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]] + ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 1610612736 + %2:_(s32) = G_CONSTANT i32 29 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_ASHR %3, %2 + $w0 = COPY %4(s32) +... 
+--- +name: no_mask_extract_asr +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_asr + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1073741824 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[AND]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[ASHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 3221225472 + %2:_(s32) = G_CONSTANT i32 30 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_ASHR %3, %2 + $w0 = COPY %4(s32) +... +--- +name: mask_extract_signed_nonneg +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: mask_extract_signed_nonneg + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[UBFX:%[0-9]+]]:_(s32) = G_UBFX [[COPY]], [[C]](s32), [[C1]] + ; CHECK-NEXT: $w0 = COPY [[UBFX]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 2147483647 + %2:_(s32) = G_CONSTANT i32 29 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_ASHR %3, %2 + $w0 = COPY %4(s32) +... +--- +name: no_mask_extract_large_shift +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_large_shift + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 33 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 12 + %2:_(s32) = G_CONSTANT i32 33 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_LSHR %3, %2 + $w0 = COPY %4(s32) +... 
+--- +name: no_mask_extract_negative_shift +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_negative_shift + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 12 + %2:_(s32) = G_CONSTANT i32 -1 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_LSHR %3, %2 + $w0 = COPY %4(s32) +... +--- +name: no_mask_extract_disjoint +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_disjoint + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 26 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 26 + %2:_(s32) = G_CONSTANT i32 1 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_LSHR %3, %2 + $w0 = COPY %4(s32) +... +--- +name: no_mask_extract_extra_bits +legalized: true +body: | + bb.0.entry: + ; CHECK-LABEL: name: no_mask_extract_extra_bits + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C1]](s32) + ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32) + %0:_(s32) = COPY $w0 + %1:_(s32) = G_CONSTANT i32 25 + %2:_(s32) = G_CONSTANT i32 2 + %3:_(s32) = G_AND %0, %1 + %4:_(s32) = G_LSHR %3, %2 + $w0 = COPY %4(s32) +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll --- a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll @@ -277,9 +277,8 @@ define dso_local void @invalid_shift(i16 %x, i8* %p) { ; CHECK-LABEL: invalid_shift: ; CHECK: ; %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ubfx w8, w0, #4, #12 ; CHECK-NEXT: strb w0, [x1] -; CHECK-NEXT: lsr w8, w8, #4 ; CHECK-NEXT: strb w8, [x1, #1] ; CHECK-NEXT: ret %t1 = trunc i16 %x to i8 @@ -316,9 +315,8 @@ define dso_local void @different_base_reg(i16 %x, i8* %p, i8 *%p2) { ; CHECK-LABEL: different_base_reg: ; CHECK: ; %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ubfx w8, w0, #8, #8 ; CHECK-NEXT: strb w0, [x1] -; CHECK-NEXT: lsr w8, w8, #8 ; CHECK-NEXT: strb w8, [x2, #1] ; CHECK-NEXT: ret %t1 = trunc i16 %x to i8 @@ -333,9 +331,8 @@ define dso_local void @second_store_is_volatile(i16 %x, i8* %p) { ; CHECK-LABEL: second_store_is_volatile: ; CHECK: ; %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ubfx w8, w0, #8, #8 ; CHECK-NEXT: strb w0, [x1] -; CHECK-NEXT: lsr w8, w8, #8 ; CHECK-NEXT: strb w8, [x1, #1] ; CHECK-NEXT: ret %t1 = trunc i16 %x to i8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -392,8 +392,7 @@ ; GFX7-LABEL: s_bswap_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_lshl_b32 s1, s0, 8 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff -; GFX7-NEXT: s_lshr_b32 s0, s0, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: ; return to shader part epilog ; @@ -427,8 +426,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: 
v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -459,14 +457,12 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s3, 0xffff +; GFX7-NEXT: s_mov_b32 s3, 0x80008 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_and_b32 s0, s0, s3 -; GFX7-NEXT: s_lshr_b32 s0, s0, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, s3 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: s_lshl_b32 s2, s1, 8 -; GFX7-NEXT: s_and_b32 s1, s1, s3 -; GFX7-NEXT: s_lshr_b32 s1, s1, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, s3 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -505,8 +501,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -541,8 +536,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -579,14 +573,11 @@ ; GFX7-LABEL: v_bswap_v2i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v0 -; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 8, 8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 8, 8 ; 
GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -687,8 +687,8 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff00, v0 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 +; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -704,8 +704,7 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_and_b32_e32 v0, 0xff00, v0 -; VI-NEXT: v_cvt_f32_ubyte1_e32 v2, v0 +; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -11,10 +11,9 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s3, 0x7f ; GFX6-NEXT: s_and_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -142,6 +141,7 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v3, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 6 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, -7, v3 
@@ -149,8 +149,6 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7f -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 @@ -267,10 +265,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX6-LABEL: s_fshl_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_and_b32 s3, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x70001 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -321,9 +318,8 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v3, 7, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -375,9 +371,8 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_fshl_i8_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -415,9 +410,8 @@ ; GFX6-LABEL: v_fshl_i8_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -455,9 +449,8 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: 
s_fshl_i8_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 5 -; GFX6-NEXT: s_lshr_b32 s1, s1, 3 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x50003 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -495,9 +488,8 @@ ; GFX6-LABEL: v_fshl_i8_5: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 3, 5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -536,13 +528,11 @@ ; GFX6-LABEL: s_fshl_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s5, s2, 7 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_lshr_b32 s3, s0, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_and_b32 s5, s1, s6 ; GFX6-NEXT: s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_lshr_b32 s5, s5, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s5 +; GFX6-NEXT: s_bfe_u32 s5, s1, 0x70001 ; GFX6-NEXT: s_lshr_b32 s2, s5, s2 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX6-NEXT: s_or_b32 s0, s0, s2 @@ -551,6 +541,7 @@ ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 +; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_or_b32 s1, s2, s1 ; GFX6-NEXT: s_and_b32 s1, s1, s6 ; GFX6-NEXT: s_and_b32 s0, s0, s6 @@ -656,15 +647,13 @@ ; GFX6-LABEL: v_fshl_v2i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 -; GFX6-NEXT: s_movk_i32 s4, 0xff -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX6-NEXT: v_and_b32_e32 v5, 7, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 -; GFX6-NEXT: v_and_b32_e32 v5, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v5, v0 +; GFX6-NEXT: 
v_bfe_u32 v5, v1, 1, 7 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v4 @@ -675,8 +664,9 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0xff +; GFX6-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -772,17 +762,15 @@ ; GFX6-LABEL: s_fshl_v4i8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s9, s2, 7 -; GFX6-NEXT: s_movk_i32 s10, 0xff ; GFX6-NEXT: s_lshr_b32 s3, s0, 8 ; GFX6-NEXT: s_lshr_b32 s4, s0, 16 ; GFX6-NEXT: s_lshr_b32 s5, s0, 24 -; GFX6-NEXT: s_lshl_b32 s0, s0, s9 -; GFX6-NEXT: s_and_b32 s9, s1, s10 ; GFX6-NEXT: s_lshr_b32 s6, s2, 8 ; GFX6-NEXT: s_lshr_b32 s7, s2, 16 ; GFX6-NEXT: s_lshr_b32 s8, s2, 24 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_lshr_b32 s9, s9, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s9 +; GFX6-NEXT: s_bfe_u32 s9, s1, 0x70001 ; GFX6-NEXT: s_lshr_b32 s2, s9, s2 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s6, 7 @@ -797,6 +785,7 @@ ; GFX6-NEXT: s_bfe_u32 s4, s1, 0x80010 ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_movk_i32 s10, 0xff ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 @@ -995,14 +984,13 @@ ; GFX6-NEXT: v_lshrrev_b32_e32 v8, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v9, 7, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX6-NEXT: v_and_b32_e32 v10, 0xff, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 1, v10 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX6-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v9, v0 -; GFX6-NEXT: 
v_lshrrev_b32_e32 v2, v2, v10 +; GFX6-NEXT: v_bfe_u32 v9, v1, 1, 7 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 7, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 @@ -1018,23 +1006,23 @@ ; GFX6-NEXT: v_bfe_u32 v4, v1, 16, 8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 -; GFX6-NEXT: v_mov_b32_e32 v9, 0xff +; GFX6-NEXT: s_movk_i32 s4, 0xff ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 7, v8 ; GFX6-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v4, v5 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v6, v1 -; GFX6-NEXT: v_and_b32_e32 v0, v0, v9 +; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v9 +; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, v1, v9 +; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1215,8 +1203,7 @@ ; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, s3 -; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1247,8 +1234,7 @@ ; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, s3 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; 
GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1279,8 +1265,7 @@ ; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -1305,10 +1290,9 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, 0xffffffe8, v0 @@ -1342,6 +1326,7 @@ ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffffe8 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_mul_lo_u32 v4, v4, v3 @@ -1349,8 +1334,6 @@ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 @@ -1374,6 +1357,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffffe8 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_lo_u32 v4, v4, v3 @@ -1381,8 +1365,6 @@ ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v4, 0xffffff -; 
GFX8-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 @@ -1406,6 +1388,7 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffffe8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX9-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 @@ -1413,8 +1396,6 @@ ; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffff -; GFX9-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 @@ -1436,16 +1417,14 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 @@ -1454,10 +1433,11 @@ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff +; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 +; 
GFX10-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1998,41 +1978,40 @@ ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX6-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX6-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v9 +; GFX6-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX6-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 -; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX6-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX6-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX6-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX6-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v8, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 24, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v7, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX6-NEXT: v_mul_hi_u32 v6, v8, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, v5, v9 -; GFX6-NEXT: v_add_i32_e32 
v5, vcc, v8, v6 -; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: v_and_b32_e32 v6, v7, v9 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX6-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX6-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, v5, 24 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v5 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2042,8 +2021,7 @@ ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v9 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 23 ; GFX6-NEXT: v_and_b32_e32 v3, v4, v9 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2055,41 +2033,40 @@ ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX8-NEXT: v_mov_b32_e32 v7, 0xffffffe8 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v9, 24 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX8-NEXT: v_mul_lo_u32 v8, v7, v6 ; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 -; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v8, v9 +; GFX8-NEXT: v_mul_hi_u32 v6, v4, v6 ; GFX8-NEXT: v_mov_b32_e32 v9, 0xffffff -; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 -; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX8-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX8-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX8-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_subrev_u32_e32 
v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v8, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 24, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v7, v8 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 +; GFX8-NEXT: v_mul_hi_u32 v7, v5, v7 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX8-NEXT: v_mul_hi_u32 v6, v8, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, v5, v9 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX8-NEXT: v_and_b32_e32 v6, v7, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v9 +; GFX8-NEXT: v_mul_lo_u32 v6, v7, 24 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_mul_lo_u32 v5, v5, 24 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v4, v5 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2099,8 +2076,7 @@ ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 ; GFX8-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_and_b32_e32 v2, v3, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_bfe_u32 v2, v3, 1, 23 ; GFX8-NEXT: v_and_b32_e32 v3, v4, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 @@ -2118,7 +2094,9 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GFX9-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX9-NEXT: v_mul_lo_u32 v8, v7, v6 +; GFX9-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX9-NEXT: v_mul_hi_u32 v8, v6, v8 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v9 @@ -2127,25 
+2105,23 @@ ; GFX9-NEXT: v_and_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, v8 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 ; GFX9-NEXT: v_mul_hi_u32 v7, v8, v7 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 -; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 -; GFX9-NEXT: v_and_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v7 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, 23, v4 +; GFX9-NEXT: v_and_b32_e32 v7, v7, v9 ; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, v7, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, v5, v7 +; GFX9-NEXT: v_sub_u32_e32 v2, v5, v6 ; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -2153,8 +2129,6 @@ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 -; GFX9-NEXT: v_and_b32_e32 v3, v3, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v4, v4, v9 ; GFX9-NEXT: v_and_b32_e32 v2, v2, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, v4, v3 @@ -2169,13 +2143,11 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 -; GFX10-NEXT: v_and_b32_e32 v2, v2, 
v10 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v6, v6 @@ -2773,9 +2745,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s3, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 -; GFX6-NEXT: s_lshr_b32 s1, s1, 1 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 @@ -2830,9 +2801,8 @@ define amdgpu_ps i16 @s_fshl_i16_4(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_fshl_i16_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 12 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x4000c ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -2872,9 +2842,8 @@ define amdgpu_ps i16 @s_fshl_i16_5(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_fshl_i16_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s0, 5 -; GFX6-NEXT: s_lshr_b32 s1, s1, 11 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x5000b ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -2918,9 +2887,8 @@ ; GFX6-NEXT: v_and_b32_e32 v3, 15, v2 ; GFX6-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_bfe_u32 v3, v3, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 @@ -2971,9 +2939,8 @@ ; GFX6-LABEL: v_fshl_i16_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 12, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 12, 4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3009,9 +2976,8 @@ ; GFX6-LABEL: v_fshl_i16_5: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 11, 5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3048,11 +3014,10 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_and_b32 s0, s1, 0xffff -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 @@ -3106,9 +3071,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s1, v0 @@ -3157,9 +3121,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s2, s1, 15 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 -; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 @@ -3214,20 +3177,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff ; 
GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_and_b32 s2, s2, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 +; GFX6-NEXT: s_mov_b32 s6, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s2, s2, s6 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s2, s3, s6 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3328,22 +3289,19 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 ; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -3409,14 +3367,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s4, 4, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 12, 
v2 +; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 +; GFX6-NEXT: s_bfe_u32 s4, 11, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 +; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3462,22 +3421,20 @@ ; GFX6-LABEL: v_fshl_v2i16_ssv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, 0xffff +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX6-NEXT: s_and_b32 s2, s2, s0 -; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX6-NEXT: s_mov_b32 s0, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s2, s2, s0 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_and_b32 s0, s3, s0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s3, s0 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3554,21 +3511,18 @@ ; GFX6-LABEL: v_fshl_v2i16_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: 
v_bfe_u32 v0, v0, 1, 15 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s4 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, s0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3641,20 +3595,18 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s0, s0, s4 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s4 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s1, s4 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3757,38 +3709,34 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 ; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s12 -; GFX6-NEXT: s_mov_b32 s12, 0xffff ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_and_b32 s4, s4, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_lshl_b32 s0, s0, s12 +; GFX6-NEXT: s_mov_b32 s12, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s4, s4, s12 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: 
s_or_b32 s0, s0, s4 ; GFX6-NEXT: s_and_b32 s4, s9, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s4, s5, s12 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: s_and_b32 s4, s10, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_and_b32 s4, s6, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s4, s6, s12 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 ; GFX6-NEXT: s_and_b32 s4, s11, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_and_b32 s4, s7, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s4, s7, s12 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3957,43 +3905,37 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v12, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX6-NEXT: v_bfe_u32 v12, v12, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v12, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v9 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v9 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, s4, v5 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 ; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 -; GFX6-NEXT: v_mov_b32_e32 v12, 0xffff ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v10 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v12 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 ; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v11 ; GFX6-NEXT: v_xor_b32_e32 v5, -1, v11 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v12 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 ; GFX6-NEXT: v_bfe_u32 v5, v5, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -370,9 +370,8 @@ define amdgpu_ps i8 @s_fshr_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_fshr_i8_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 4 -; GFX6-NEXT: s_lshr_b32 s1, s1, 4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x40004 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -410,9 +409,8 @@ ; GFX6-LABEL: v_fshr_i8_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0 
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 4, 4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -450,9 +448,8 @@ define amdgpu_ps i8 @s_fshr_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-LABEL: s_fshr_i8_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 3 -; GFX6-NEXT: s_lshr_b32 s1, s1, 5 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0x30005 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -490,9 +487,8 @@ ; GFX6-LABEL: v_fshr_i8_5: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 5, 3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2657,9 +2653,8 @@ define amdgpu_ps i16 @s_fshr_i16_4(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_fshr_i16_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s0, 12 -; GFX6-NEXT: s_lshr_b32 s1, s1, 4 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0xc0004 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -2699,9 +2694,8 @@ define amdgpu_ps i16 @s_fshr_i16_5(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-LABEL: s_fshr_i16_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s0, 11 -; GFX6-NEXT: s_lshr_b32 s1, s1, 5 +; GFX6-NEXT: s_bfe_u32 s1, s1, 0xb0005 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; @@ -2798,9 +2792,8 @@ ; GFX6-LABEL: v_fshr_i16_4: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 12, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 4, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 4, 12 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -2836,9 +2829,8 @@ ; GFX6-LABEL: v_fshr_i16_5: ; 
GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 11, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 5, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 5, 11 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3036,38 +3028,37 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 -; GFX6-NEXT: s_and_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 +; GFX6-NEXT: s_mov_b32 s6, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_and_b32 s7, s2, s6 +; GFX6-NEXT: s_bfe_u32 s7, s2, s6 +; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 -; GFX6-NEXT: s_and_b32 s5, s3, s6 -; GFX6-NEXT: s_lshr_b32 s7, s7, 15 -; GFX6-NEXT: s_lshr_b32 s5, s5, 15 -; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s5, s3, s6 +; GFX6-NEXT: s_lshr_b32 s7, s7, s8 +; GFX6-NEXT: s_lshr_b32 s5, s5, s8 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s7 ; GFX6-NEXT: s_or_b32 s1, s1, s5 +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 ; GFX6-NEXT: s_and_b32 s7, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_and_b32 s2, s2, s6 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: s_bfe_u32 s2, s2, s6 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s7 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s3, s6 -; GFX6-NEXT: s_lshr_b32 s2, s2, 1 +; GFX6-NEXT: 
s_bfe_u32 s2, s3, s6 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3176,41 +3167,38 @@ ; GFX6-LABEL: v_fshr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v6 -; GFX6-NEXT: s_mov_b32 s5, 0xffff +; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX6-NEXT: v_or_b32_e32 v4, v5, v4 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v5, s5, v2 +; GFX6-NEXT: v_bfe_u32 v5, v2, 1, 15 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX6-NEXT: v_and_b32_e32 v5, s5, v3 +; GFX6-NEXT: v_bfe_u32 v5, v3, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v5, 15, v5 +; GFX6-NEXT: v_lshrrev_b32_e32 v5, s5, v5 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX6-NEXT: v_and_b32_e32 v7, 15, v4 +; GFX6-NEXT: v_and_b32_e32 v6, 15, v4 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v2 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v4 -; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX6-NEXT: v_bfe_u32 v7, v7, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 +; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 ; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v6, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v5 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_xor_b32_e32 v4, -1, v5 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX6-NEXT: v_and_b32_e32 v4, 15, 
v4 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_and_b32_e32 v2, v3, v6 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 ; GFX6-NEXT: v_bfe_u32 v3, v4, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 @@ -3286,14 +3274,15 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s4, 12, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 4, v2 +; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 15 +; GFX6-NEXT: s_bfe_u32 s4, 3, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: s_bfe_u32 s5, 8, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, s5, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_bfe_u32 v2, v3, 1, 15 +; GFX6-NEXT: s_bfe_u32 s4, 7, 0x100000 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -3341,36 +3330,35 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xffff +; GFX6-NEXT: s_mov_b32 s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s6, s2, s5 +; GFX6-NEXT: s_bfe_u32 s6, s2, s5 +; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s6, s6, 15 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_bfe_u32 v2, 
v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_and_b32 s0, s2, s5 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s2, s5 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s4, s3, s5 -; GFX6-NEXT: s_lshl_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s4, s3, s5 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, 15 +; GFX6-NEXT: s_lshr_b32 s4, s4, s7 +; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX6-NEXT: s_and_b32 s0, s3, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s3, s5 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3456,38 +3444,36 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v2, s4, v0 +; GFX6-NEXT: v_bfe_u32 v2, v0, 1, 15 +; GFX6-NEXT: s_bfe_u32 s4, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; GFX6-NEXT: v_bfe_u32 v3, v1, 1, 15 ; GFX6-NEXT: v_or_b32_e32 v2, s0, v2 ; GFX6-NEXT: s_lshl_b32 s0, s1, s3 -; GFX6-NEXT: v_lshrrev_b32_e32 v3, 15, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, s4, v3 ; GFX6-NEXT: v_or_b32_e32 v3, s0, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_xor_b32 s0, s2, -1 +; GFX6-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_lshr_b32 s1, s0, 16 ; GFX6-NEXT: s_and_b32 s2, s0, 15 ; GFX6-NEXT: s_andn2_b32 s0, 15, s0 -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 1, 15 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s1, 15 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s2, v2 ; GFX6-NEXT: s_andn2_b32 s1, 15, s1 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s0, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 15 ; GFX6-NEXT: s_bfe_u32 s0, s1, 0x100000 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 @@ -3573,38 +3559,37 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <2 x i16> inreg %amt) { ; GFX6-LABEL: v_fshr_v2i16_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s4 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 +; GFX6-NEXT: s_mov_b32 s4, 0xf0001 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_and_b32 s5, s0, s4 +; GFX6-NEXT: s_bfe_u32 s5, s0, s4 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 -; GFX6-NEXT: s_and_b32 s3, s1, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, 15 -; GFX6-NEXT: s_lshr_b32 s3, s3, 15 -; GFX6-NEXT: s_lshl_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s3, s1, s4 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 +; GFX6-NEXT: s_lshr_b32 s3, s3, s6 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 ; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 +; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 ; GFX6-NEXT: s_and_b32 
s5, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s4 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s0, s4 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_lshl_b32 s1, s1, 1 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s4 -; GFX6-NEXT: s_lshr_b32 s0, s0, 1 +; GFX6-NEXT: s_bfe_u32 s0, s1, s4 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3715,35 +3700,35 @@ ; GFX6-NEXT: s_or_b32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 ; GFX6-NEXT: s_and_b32 s10, s10, s12 +; GFX6-NEXT: s_mov_b32 s11, 0xf0001 ; GFX6-NEXT: s_or_b32 s9, s9, s10 ; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: s_and_b32 s11, s4, s12 +; GFX6-NEXT: s_bfe_u32 s12, s4, s11 +; GFX6-NEXT: s_bfe_u32 s13, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, 15 -; GFX6-NEXT: s_or_b32 s0, s0, s11 -; GFX6-NEXT: s_and_b32 s11, s5, s12 +; GFX6-NEXT: s_lshr_b32 s12, s12, s13 +; GFX6-NEXT: s_or_b32 s0, s0, s12 +; GFX6-NEXT: s_bfe_u32 s12, s5, s11 ; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s11, s11, 15 -; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_lshr_b32 s12, s12, s13 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s11 -; GFX6-NEXT: s_lshr_b32 s11, s8, 16 -; GFX6-NEXT: s_and_b32 s13, s8, 15 +; GFX6-NEXT: s_or_b32 s1, s1, s12 +; GFX6-NEXT: s_lshl_b32 s4, s4, 1 +; GFX6-NEXT: s_lshr_b32 s12, s8, 16 +; GFX6-NEXT: s_and_b32 s14, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_and_b32 s4, s4, s12 -; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; 
GFX6-NEXT: s_bfe_u32 s14, s14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, s11 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s13 +; GFX6-NEXT: s_lshl_b32 s0, s0, s14 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s11, 15 +; GFX6-NEXT: s_and_b32 s4, s12, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 +; GFX6-NEXT: s_andn2_b32 s8, 15, s12 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s12 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 +; GFX6-NEXT: s_bfe_u32 s4, s5, s11 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3752,32 +3737,30 @@ ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, s10 -; GFX6-NEXT: s_and_b32 s2, s6, s12 -; GFX6-NEXT: s_lshr_b32 s2, s2, 15 +; GFX6-NEXT: s_bfe_u32 s2, s6, s11 +; GFX6-NEXT: s_lshr_b32 s2, s2, s13 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, s10 -; GFX6-NEXT: s_and_b32 s3, s7, s12 -; GFX6-NEXT: s_lshr_b32 s3, s3, 15 +; GFX6-NEXT: s_bfe_u32 s3, s7, s11 +; GFX6-NEXT: s_lshr_b32 s3, s3, s13 +; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 -; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_lshl_b32 s4, s7, 1 ; GFX6-NEXT: s_lshr_b32 s6, s5, 16 ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 -; GFX6-NEXT: s_and_b32 s3, s3, s12 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s3, s3, s11 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s3 ; GFX6-NEXT: s_and_b32 s3, s6, 15 -; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, 
s4, s12 -; GFX6-NEXT: s_lshr_b32 s3, s3, 1 +; GFX6-NEXT: s_bfe_u32 s3, s4, s11 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 @@ -3967,48 +3950,46 @@ ; GFX6-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v11 ; GFX6-NEXT: v_and_b32_e32 v10, v10, v12 -; GFX6-NEXT: s_mov_b32 s5, 0xffff ; GFX6-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: v_and_b32_e32 v10, s5, v4 +; GFX6-NEXT: v_bfe_u32 v10, v4, 1, 15 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 15, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v10 -; GFX6-NEXT: v_and_b32_e32 v10, s5, v5 +; GFX6-NEXT: v_bfe_u32 v10, v5, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v10, 15, v10 +; GFX6-NEXT: v_lshrrev_b32_e32 v10, s5, v10 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_lshrrev_b32_e32 v10, 16, v8 ; GFX6-NEXT: v_and_b32_e32 v11, 15, v8 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v4 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v11, v11, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v11, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v10 -; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_xor_b32_e32 v8, -1, v10 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 1, v5 ; GFX6-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, v4, v1 -; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: 
v_bfe_u32 v4, v5, 1, 15 ; GFX6-NEXT: v_bfe_u32 v5, v8, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: v_and_b32_e32 v4, v6, v12 +; GFX6-NEXT: v_bfe_u32 v4, v6, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX6-NEXT: v_and_b32_e32 v4, v7, v12 +; GFX6-NEXT: v_bfe_u32 v4, v7, 1, 15 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, s4, v3 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 15, v4 +; GFX6-NEXT: v_lshrrev_b32_e32 v4, s5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 1, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v9 @@ -4017,20 +3998,18 @@ ; GFX6-NEXT: v_and_b32_e32 v8, 15, v6 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v6 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 -; GFX6-NEXT: v_and_b32_e32 v4, v4, v12 ; GFX6-NEXT: v_bfe_u32 v8, v8, 0, 16 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v4, 1, 15 ; GFX6-NEXT: v_bfe_u32 v6, v6, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, v8, v2 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 15, v7 ; GFX6-NEXT: v_xor_b32_e32 v6, -1, v7 -; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX6-NEXT: v_bfe_u32 v4, v4, 0, 16 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, v4, v3 -; GFX6-NEXT: v_and_b32_e32 v4, v5, v12 -; GFX6-NEXT: v_lshrrev_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_bfe_u32 v4, v5, 1, 15 ; GFX6-NEXT: v_bfe_u32 v5, v6, 0, 16 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v5, v4 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -1020,8 +1020,7 @@ ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX6-NEXT: s_and_b32 s0, s0, 0x1c0 -; GFX6-NEXT: s_lshr_b32 s0, s0, 6 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -1039,8 +1038,7 @@ ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 0x1ff -; GFX6-NEXT: s_lshr_b32 s0, s0, 6 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x30006 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -42,8 +42,7 @@ ; GFX6-LABEL: v_lshr_i8_7: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX6-NEXT: v_bfe_u32 v0, v0, 7, 1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_i8_7: @@ -110,14 +109,12 @@ define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) { ; GCN-LABEL: s_lshr_i8_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xff -; GCN-NEXT: s_lshr_b32 s0, s0, 7 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x10007 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_lshr_i8_7: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xff -; GFX10-NEXT: s_lshr_b32 s0, s0, 7 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x10007 ; GFX10-NEXT: ; return to shader part epilog %result = lshr i8 %value, 7 ret i8 %result @@ -151,16 +148,14 @@ ; GCN-LABEL: v_lshr_i24_7: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 7, 17 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_lshr_i24_7: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 7, v0 +; GFX10-NEXT: v_bfe_u32 v0, v0, 7, 17 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = lshr i24 %value, 7 ret i24 %result @@ -189,14 +184,12 @@ define amdgpu_ps i24 @s_lshr_i24_7(i24 inreg %value) { ; GCN-LABEL: s_lshr_i24_7: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xffffff -; GCN-NEXT: s_lshr_b32 s0, s0, 7 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x110007 ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_lshr_i24_7: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffffff -; GFX10-NEXT: s_lshr_b32 s0, s0, 7 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x110007 ; GFX10-NEXT: ; return to shader part epilog %result = lshr i24 %value, 7 ret i24 %result @@ -664,14 +657,12 @@ define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) { ; GCN-LABEL: s_lshr_i16_15: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b32 s0, s0, 0xffff -; GCN-NEXT: s_lshr_b32 s0, s0, 15 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x1000f ; GCN-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_lshr_i16_15: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s0, s0, 15 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x1000f ; GFX10-NEXT: ; return to shader part epilog %result = lshr i16 %value, 15 ret i16 %result @@ -774,11 +765,8 @@ ; GFX6-LABEL: v_lshr_v2i16_15: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 15, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 15, v1 +; GFX6-NEXT: v_bfe_u32 v0, v0, 15, 1 +; GFX6-NEXT: v_bfe_u32 v1, v1, 15, 1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_lshr_v2i16_15: