AMDGPU: Select BFI patterns with 64-bit values.

Converts the 32-bit-only SHA-256 Ma / Ch / bitselect BFI patterns into
multiclasses that also match i64 by splitting into two 32-bit BFI ops via
REG_SEQUENCE, and updates the bfi_int.ll test accordingly.
(Reconstructed: template-argument lists such as <V_BFI_B32, ...> were lost
to markup stripping and have been restored per upstream LLVM.)

Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -570,6 +570,18 @@
     (BFI_INT $x, $y, $z)
   >;
 
+  // 64-bit version
+  def : AMDGPUPat <
+    (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
+    (REG_SEQUENCE RC64,
+      (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+               (i32 (EXTRACT_SUBREG $y, sub0)),
+               (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+      (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+               (i32 (EXTRACT_SUBREG $y, sub1)),
+               (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+  >;
+
   // SHA-256 Ch function
   // z ^ (x & (y ^ z))
   def : AMDGPUPat <
@@ -577,6 +589,18 @@
     (BFI_INT $x, $y, $z)
   >;
 
+  // 64-bit version
+  def : AMDGPUPat <
+    (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+    (REG_SEQUENCE RC64,
+      (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)),
+               (i32 (EXTRACT_SUBREG $y, sub0)),
+               (i32 (EXTRACT_SUBREG $z, sub0))), sub0,
+      (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)),
+               (i32 (EXTRACT_SUBREG $y, sub1)),
+               (i32 (EXTRACT_SUBREG $z, sub1))), sub1)
+  >;
+
   def : AMDGPUPat <
     (fcopysign f32:$src0, f32:$src1),
     (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
@@ -610,10 +634,25 @@
 // SHA-256 Ma patterns
 
 // ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y
-class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat <
-  (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
-  (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
->;
+multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> {
+  def : AMDGPUPat <
+    (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))),
+    (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
+  >;
+
+  def : AMDGPUPat <
+    (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))),
+    (REG_SEQUENCE RC64,
+      (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)),
+                    (i32 (EXTRACT_SUBREG $y, sub0))),
+               (i32 (EXTRACT_SUBREG $z, sub0)),
+               (i32 (EXTRACT_SUBREG $y, sub0))), sub0,
+      (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)),
+                    (i32 (EXTRACT_SUBREG $y, sub1))),
+               (i32 (EXTRACT_SUBREG $z, sub1)),
+               (i32 (EXTRACT_SUBREG $y, sub1))), sub1)
+  >;
+}
 
 // Bitfield extract patterns
 
Index: lib/Target/AMDGPU/EvergreenInstructions.td
===================================================================
--- lib/Target/AMDGPU/EvergreenInstructions.td
+++ lib/Target/AMDGPU/EvergreenInstructions.td
@@ -693,7 +693,7 @@
 def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>;
 
 // SHA-256 Patterns
-def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
+defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>;
 
 def EG_ExportSwz : ExportSwzInst {
   let Word1{19-16} = 0; // BURST_COUNT
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1098,6 +1098,7 @@
 def : IMad24Pat<V_MAD_I32_I24>;
 def : UMad24Pat<V_MAD_U32_U24>;
 
+// FIXME: This should only be done for VALU inputs
 defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
 
 def : ROTRPattern <V_ALIGNBIT_B32>;
@@ -1487,7 +1488,7 @@
 // FIXME: defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
 
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>;
 
 def : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>;
 def : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>;
Index: test/CodeGen/AMDGPU/bfi_int.ll
===================================================================
--- test/CodeGen/AMDGPU/bfi_int.ll
+++ test/CodeGen/AMDGPU/bfi_int.ll
@@ -1,14 +1,14 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s
 
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
 ;
-; R600: {{^}}bfi_def:
+; FUNC-LABEL: {{^}}bfi_def:
 ; R600: BFI_INT
-; SI: @bfi_def
-; SI: v_bfi_b32
+
+; GCN: v_bfi_b32
 define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
@@ -21,10 +21,10 @@
 
 ; SHA-256 Ch function
 ; z ^ (x & (y ^ z))
-; R600: {{^}}bfi_sha256_ch:
+; FUNC-LABEL: {{^}}bfi_sha256_ch:
 ; R600: BFI_INT
-; SI: @bfi_sha256_ch
-; SI: v_bfi_b32
+
+; GCN: v_bfi_b32
 define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
@@ -36,12 +36,12 @@
 
 ; SHA-256 Ma function
 ; ((x & z) | (y & (x | z)))
-; R600: {{^}}bfi_sha256_ma:
+; FUNC-LABEL: {{^}}bfi_sha256_ma:
 ; R600: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
 ; R600: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
-; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
+; GCN: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
+; GCN: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = and i32 %x, %z
@@ -51,3 +51,137 @@
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_bitselect_v2i32_pat1:
+; GCN: s_waitcnt
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GCN-NEXT: s_setpc_b64
+define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
+  %xor.0 = xor <2 x i32> %a, %mask
+  %and = and <2 x i32> %xor.0, %b
+  %bitselect = xor <2 x i32> %and, %mask
+  ret <2 x i32> %bitselect
+}
+
+; FUNC-LABEL: {{^}}v_bitselect_i64_pat_0:
+; GCN: s_waitcnt
+; GCN-NEXT: v_bfi_b32 v1, v1, v3, v5
+; GCN-NEXT: v_bfi_b32 v0, v0, v2, v4
+; GCN-NEXT: s_setpc_b64
+define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
+  %and0 = and i64 %a, %b
+  %not.a = xor i64 %a, -1
+  %and1 = and i64 %not.a, %mask
+  %bitselect = or i64 %and0, %and1
+  ret i64 %bitselect
+}
+
+; FUNC-LABEL: {{^}}v_bitselect_i64_pat_1:
+; GCN: s_waitcnt
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
+; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GCN-NEXT: s_setpc_b64
+define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
+  %xor.0 = xor i64 %a, %mask
+  %and = and i64 %xor.0, %b
+  %bitselect = xor i64 %and, %mask
+  ret i64 %bitselect
+}
+
+; FUNC-LABEL: {{^}}v_bitselect_i64_pat_2:
+; GCN: s_waitcnt
+; GCN-DAG: v_bfi_b32 v0, v2, v0, v4
+; GCN-DAG: v_bfi_b32 v1, v3, v1, v5
+; GCN-NEXT: s_setpc_b64
+define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
+  %xor.0 = xor i64 %a, %mask
+  %and = and i64 %xor.0, %b
+  %bitselect = xor i64 %and, %mask
+  ret i64 %bitselect
+}
+
+; FUNC-LABEL: {{^}}v_bfi_sha256_ma_i64:
+; GCN-DAG: v_xor_b32_e32 v1, v1, v3
+; GCN-DAG: v_xor_b32_e32 v0, v0, v2
+; GCN-DAG: v_bfi_b32 v1, v1, v5, v3
+; GCN-DAG: v_bfi_b32 v0, v0, v4, v2
+define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
+entry:
+  %and0 = and i64 %x, %z
+  %or0 = or i64 %x, %z
+  %and1 = and i64 %y, %or0
+  %or1 = or i64 %and0, %and1
+  ret i64 %or1
+}
+
+; FIXME: Should leave as 64-bit SALU ops
+; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_bfi_b32
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_bfi_b32
+define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
+  %and0 = and i64 %a, %b
+  %not.a = xor i64 %a, -1
+  %and1 = and i64 %not.a, %mask
+  %bitselect = or i64 %and0, %and1
+
+  %scalar.use = add i64 %bitselect, 10
+  store i64 %scalar.use, i64 addrspace(1)* undef
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_bfi_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_bfi_b32
+define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
+  %xor.0 = xor i64 %a, %mask
+  %and = and i64 %xor.0, %b
+  %bitselect = xor i64 %and, %mask
+
+  %scalar.use = add i64 %bitselect, 10
+  store i64 %scalar.use, i64 addrspace(1)* undef
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_bitselect_i64_pat_2:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_bfi_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN: v_bfi_b32
+define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
+  %xor.0 = xor i64 %a, %mask
+  %and = and i64 %xor.0, %b
+  %bitselect = xor i64 %and, %mask
+
+  %scalar.use = add i64 %bitselect, 10
+  store i64 %scalar.use, i64 addrspace(1)* undef
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_bfi_sha256_ma_i64:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_xor_b32
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
+; GCN-DAG: v_xor_b32
+; GCN-DAG: v_bfi_b32
+; GCN: v_bfi_b32
+define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
+entry:
+  %and0 = and i64 %x, %z
+  %or0 = or i64 %x, %z
+  %and1 = and i64 %y, %or0
+  %or1 = or i64 %and0, %and1
+
+  %scalar.use = add i64 %or1, 10
+  store i64 %scalar.use, i64 addrspace(1)* undef
+  ret void
+}