diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2577,6 +2577,23 @@
   (V_BFE_I32_e64 $src, (i32 0), $width)
 >;
 
+// An OpenCL front-end, as per
+// https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_C.html#operators-shift
+// , can emit (and (sub 32, i32:$x), 31) instead of (sub 32, i32:$x) as the
+// second operand of a 32 bit shift expression. This operand can be transformed
+// to (and (sub 0, i32:$x), 31) by the optimizer.
+// Logical shift right of a left shift by the same masked amount selects to an
+// unsigned bitfield extract (V_BFE_U32).
+def : AMDGPUPat <
+  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (and (sub 0, i32:$width), 31)),
+                         (and (sub 0, i32:$width), 31)),
+  (V_BFE_U32_e64 $src, (i32 0), $width)
+>;
+
+// Arithmetic shift right of the same shl pattern sign-extends, so it selects
+// to the signed bitfield extract (V_BFE_I32).
+def : AMDGPUPat <
+  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (and (sub 0, i32:$width), 31)),
+                         (and (sub 0, i32:$width), 31)),
+  (V_BFE_I32_e64 $src, (i32 0), $width)
+>;
+
 // SHA-256 Ma patterns
 
 // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -19,6 +19,16 @@
   ret void
 }
 
+; shl followed by ashr with an (and (sub 0, %bits), 31) shift amount must
+; select to a signed bitfield extract.
+; GCN-LABEL: {{^}}shl_mask:
+; GCN: v_bfe_i32 v0, v0, 0, v1
+define i32 @shl_mask(i32 %a, i32 %bits) {
+  %sub = sub i32 0, %bits
+  %shl.mask = and i32 %sub, 31
+  %shl = shl i32 %a, %shl.mask
+  %shr = ashr i32 %shl, %shl.mask
+  ret i32 %shr
+}
+
 ; GCN-LABEL: {{^}}v_ubfe_sub_multi_use_shl_i32:
 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]