diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2510,6 +2510,26 @@
   let SubtargetPredicate = NotHasAddNoCarryInsts;
 }
 
+// Eliminate `and` for constrained shift since the target shift instructions are
+// constrained.
+multiclass ConstrainedShiftPat<int width> {
+  defvar mask = !sub(width, 1);
+  defvar ty = !cast<ValueType>("i" # width);
+
+  def : AMDGPUPat <(shl ty:$a, (and ty:$b, mask)),
+                   (!cast<Instruction>("V_LSHLREV_B" # width # "_e64") $b, $a)>;
+
+  def : AMDGPUPat <(srl ty:$a, (and ty:$b, mask)),
+                   (!cast<Instruction>("V_LSHRREV_B" # width # "_e64") $b, $a)>;
+
+  def : AMDGPUPat <(sra ty:$a, (and ty:$b, mask)),
+                   (!cast<Instruction>("V_ASHRREV_I" # width # "_e64") $b, $a)>;
+}
+
+// FIXME: 64 bit pattern match is not generated.
+foreach i = [16, 32, 64] in {
+  defm : ConstrainedShiftPat<i>;
+}
 
 // Avoid pointlessly materializing a constant in VGPR.
 // FIXME: Should also do this for readlane, but tablegen crashes on
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -639,10 +639,8 @@
 ; VI-LABEL: v_fshr_i16:
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_xor_b32_e32 v3, -1, v2
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; VI-NEXT: v_and_b32_e32 v3, 15, v3
-; VI-NEXT: v_and_b32_e32 v2, 15, v2
+; VI-NEXT: v_xor_b32_e32 v3, -1, v2
 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -651,10 +649,8 @@
 ; GFX9-LABEL: v_fshr_i16:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
-; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -669,10 +665,8 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
 ; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
@@ -701,18 +695,14 @@
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; VI-NEXT: v_and_b32_e32 v4, 15, v3
 ; VI-NEXT: v_mov_b32_e32 v5, 1
-; VI-NEXT: v_xor_b32_e32 v3, -1, v3
+; VI-NEXT: v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_and_b32_e32 v3, 15, v3
-; VI-NEXT: v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_xor_b32_e32 v3, -1, v3
 ; VI-NEXT: v_lshlrev_b16_e32 v3, v3, v5
 ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_xor_b32_e32 v4, -1, v2
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; VI-NEXT: v_and_b32_e32 v4, 15, v4
-; VI-NEXT: v_and_b32_e32 v2, 15, v2
+; VI-NEXT: v_xor_b32_e32 v4, -1, v2
 ; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0
 ; VI-NEXT: v_lshrrev_b16_e32 v1, v2, v1
 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -779,27 +769,21 @@
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; VI-NEXT: v_and_b32_e32 v7, 15, v6
 ; VI-NEXT: v_mov_b32_e32 v8, 1
-; VI-NEXT: v_xor_b32_e32 v6, -1, v6
+; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_and_b32_e32 v6, 15, v6
-; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_xor_b32_e32 v6, -1, v6
 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v8
 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_xor_b32_e32 v7, -1, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; VI-NEXT: v_and_b32_e32 v7, 15, v7
-; VI-NEXT: v_and_b32_e32 v5, 15, v5
+; VI-NEXT: v_xor_b32_e32 v7, -1, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1
 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
-; VI-NEXT: v_xor_b32_e32 v3, -1, v4
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; VI-NEXT: v_and_b32_e32 v3, 15, v3
+; VI-NEXT: v_xor_b32_e32 v3, -1, v4
 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; VI-NEXT: v_and_b32_e32 v3, 15, v4
-; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -808,27 +792,21 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
-; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v8
 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v7, v1
 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
 ; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0
@@ -844,31 +822,25 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX10-NEXT: v_and_b32_e32 v9, 15, v6
-; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
-; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
-; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
-; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX10-NEXT: v_xor_b32_e32 v10, -1, v6
+; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7
 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
-; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7
+; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT: v_lshrrev_b16 v4, v6, v9
 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
-; GFX10-NEXT: v_lshlrev_b16 v6, v6, v10
-; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
+; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7
+; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 15, v5
+; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5
 ; GFX10-NEXT: v_or_b32_e32 v4, v6, v4
-; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
+; GFX10-NEXT: v_lshlrev_b16 v1, v2, v1
 ; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
   %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
   ret <3 x i16> %ret
@@ -905,34 +877,26 @@
 ; VI: ; %bb.0:
 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; VI-NEXT: v_and_b32_e32 v7, 15, v6
 ; VI-NEXT: v_mov_b32_e32 v8, 1
-; VI-NEXT: v_xor_b32_e32 v6, -1, v6
+; VI-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_and_b32_e32 v6, 15, v6
-; VI-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_xor_b32_e32 v6, -1, v6
 ; VI-NEXT: v_lshlrev_b16_e32 v6, v6, v9
 ; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; VI-NEXT: v_and_b32_e32 v9, 15, v7
-; VI-NEXT: v_xor_b32_e32 v7, -1, v7
+; VI-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_and_b32_e32 v7, 15, v7
+; VI-NEXT: v_xor_b32_e32 v7, -1, v7
 ; VI-NEXT: v_lshlrev_b16_e32 v7, v7, v8
-; VI-NEXT: v_xor_b32_e32 v8, -1, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; VI-NEXT: v_and_b32_e32 v8, 15, v8
-; VI-NEXT: v_and_b32_e32 v5, 15, v5
+; VI-NEXT: v_xor_b32_e32 v8, -1, v5
 ; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1
 ; VI-NEXT: v_lshrrev_b16_e32 v3, v5, v3
 ; VI-NEXT: v_or_b32_e32 v1, v1, v3
-; VI-NEXT: v_xor_b32_e32 v3, -1, v4
 ; VI-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; VI-NEXT: v_and_b32_e32 v3, 15, v3
+; VI-NEXT: v_xor_b32_e32 v3, -1, v4
 ; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; VI-NEXT: v_and_b32_e32 v3, 15, v4
-; VI-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; VI-NEXT: v_lshrrev_b16_e32 v2, v4, v2
 ; VI-NEXT: v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: v_or_b32_e32 v0, v0, v2
 ; VI-NEXT: v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -943,34 +907,26 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-NEXT: v_and_b32_e32 v7, 15, v6
 ; GFX9-NEXT: v_mov_b32_e32 v8, 1
-; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
+; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT: v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_xor_b32_e32 v6, -1, v6
 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v9
 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
-; GFX9-NEXT: v_and_b32_e32 v9, 15, v7
-; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
+; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT: v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT: v_xor_b32_e32 v7, -1, v7
 ; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8
-; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v1
-; GFX9-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: v_xor_b32_e32 v8, -1, v5
 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v8, v1
 ; GFX9-NEXT: v_lshrrev_b16_e32 v3, v5, v3
 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v3
+; GFX9-NEXT: v_xor_b32_e32 v3, -1, v4
 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0
-; GFX9-NEXT: v_and_b32_e32 v3, 15, v4
-; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2
+; GFX9-NEXT: v_lshrrev_b16_e32 v2, v4, v2
 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX9-NEXT: v_or_b32_e32 v7, v7, v9
@@ -989,40 +945,32 @@
 ; GFX10: ; %bb.0:
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
-; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6
-; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
-; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
-; GFX10-NEXT: v_and_b32_e32 v13, 15, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
-; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
-; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
+; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
+; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7
+; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v0
 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
-; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
-; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
-; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
-; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
-; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
-; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX10-NEXT: v_and_b32_e32 v5, 15, v5
-; GFX10-NEXT: v_and_b32_e32 v8, 15, v8
-; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
-; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
+; GFX10-NEXT: v_xor_b32_e32 v12, -1, v5
+; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
+; GFX10-NEXT: v_xor_b32_e32 v9, -1, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
+; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11
 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
+; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0
+; GFX10-NEXT: v_lshlrev_b16 v1, v12, v1
 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
-; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
-; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
-; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
+; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10
+; GFX10-NEXT: v_lshlrev_b16 v5, v13, v8
 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
-; GFX10-NEXT: v_or_b32_e32 v3, v7, v6
+; GFX10-NEXT: v_or_b32_e32 v3, v6, v7
 ; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
 ; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
 ; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -245,8 +245,7 @@
 }
 
 ; GCN-LABEL: {{^}}trunc_shl_and31:
-; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-NOT: v_lshl_b64
 ; GCN-NOT: v_lshlrev_b64
 define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
diff --git a/llvm/test/CodeGen/AMDGPU/shift-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-opts.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shift-opts.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 < %s | FileCheck %s
+
+define i32 @constrained_shift(i32 %a, i32 %b) {
+; CHECK-LABEL: constrained_shift:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_lshlrev_b32_e32 v2, v1, v0
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, v1, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, v1, v0
+; CHECK-NEXT: v_add3_u32 v0, v2, v3, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %and = and i32 %b, 31
+  %shl = shl i32 %a, %and
+  %lshr = lshr i32 %a, %and
+  %ashr = ashr i32 %a, %and
+  %ret.0 = add i32 %shl, %lshr
+  %ret = add i32 %ret.0, %ashr
+  ret i32 %ret
+}
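Note (illustration only, not part of the patch): once the string concatenation and the !cast<> lookups are resolved, ConstrainedShiftPat<32> expands to roughly the patterns below. The REV-form VALU shifts take the shift amount as their first source operand, which is why $b and $a are swapped on the output side. This sketch assumes the AMDGPUPat class and the V_*REV_*_e64 instruction definitions that already exist in the AMDGPU backend.

// Rough expansion of ConstrainedShiftPat<32> (illustrative sketch).
def : AMDGPUPat <(shl i32:$a, (and i32:$b, 31)),
                 (V_LSHLREV_B32_e64 $b, $a)>;
def : AMDGPUPat <(srl i32:$a, (and i32:$b, 31)),
                 (V_LSHRREV_B32_e64 $b, $a)>;
def : AMDGPUPat <(sra i32:$a, (and i32:$b, 31)),
                 (V_ASHRREV_I32_e64 $b, $a)>;

Because these instructions only use the low bits of the shift amount, the explicit and with 31 (or 15 for the 16-bit forms) is redundant, which is what removes the v_and_b32 instructions in the fshr.ll checks above.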
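Note (illustration only, not part of the patch): a hypothetical reproducer for the 64-bit FIXME above. The foreach does emit an i64 pattern with mask 63, but per the FIXME it is not currently matched, so a masked 64-bit shift like the one below is presumably still selected with the and on the shift amount intact.

; Hypothetical follow-up test, not included in this change.
define i64 @constrained_shift_i64(i64 %a, i64 %b) {
  %and = and i64 %b, 63
  %shl = shl i64 %a, %and
  ret i64 %shl
}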