Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2955,6 +2955,50 @@ } } + // Reduce bit extract of low half of an integer to the narrower type. + // (and (srl i64:x, K), KMask) -> + // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask) + if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { + if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) { + if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + unsigned Size = VT.getSizeInBits(); + const APInt &AndMask = CAnd->getAPIntValue(); + unsigned ShiftBits = CShift->getZExtValue(); + unsigned MaskBits = AndMask.countTrailingOnes(); + EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); + + if (APIntOps::isMask(AndMask) && + // Required bits must not span the two halves of the integer and + // must fit in the half size type. + (ShiftBits + MaskBits <= Size / 2) && + TLI.isNarrowingProfitable(VT, HalfVT) && + TLI.isTypeDesirableForOp(ISD::AND, HalfVT) && + TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) && + TLI.isTruncateFree(VT, HalfVT) && + TLI.isZExtFree(HalfVT, VT)) { + // The isNarrowingProfitable is to avoid regressions on PPC and + // AArch64 which match a few 64-bit bit insert / bit extract patterns + // on downstream users of this. Those patterns could probably be + // extended to handle extensions mixed in. + + SDLoc SL(N0); + assert(ShiftBits != 0 && MaskBits <= Size); + + // Extracting the highest bit of the low half. 
+ EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT, + N0.getOperand(0)); + + SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT); + SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT); + SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK); + SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask); + return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And); + } + } + } + } + return SDValue(); } Index: test/CodeGen/AArch64/arm64-bitfield-extract.ll =================================================================== --- test/CodeGen/AArch64/arm64-bitfield-extract.ll +++ test/CodeGen/AArch64/arm64-bitfield-extract.ll @@ -41,7 +41,7 @@ define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp { ; CHECK-LABEL: fct1: -; CHECK: ubfx +; CHECK: ubfx x{{[0-9]+}}, x{{[0-9]+}} ; CHECK-NOT: and ; CHECK: ret Index: test/CodeGen/AMDGPU/cgp-bitfield-extract.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -223,12 +223,10 @@ ; GCN: s_cbranch_vccnz BB4_2 -; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 15 -; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0xff +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000f ; GCN: BB4_2: -; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 15 -; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7f +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7000f ; GCN: BB4_3: ; GCN: buffer_store_dwordx2 Index: test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -0,0 +1,116 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Extract the high bit of the 1st quarter +; GCN-LABEL: {{^}}v_uextract_bit_31_i128: +; 
GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: s_endpgm +define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x + %ld.64 = load i128, i128 addrspace(1)* %in.gep + %srl = lshr i128 %ld.64, 31 + %bit = and i128 %srl, 1 + store i128 %bit, i128 addrspace(1)* %out.gep + ret void +} + +; Extract the high bit of the 2nd quarter +; GCN-LABEL: {{^}}v_uextract_bit_63_i128: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: s_endpgm +define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 
%id.x + %ld.64 = load i128, i128 addrspace(1)* %in.gep + %srl = lshr i128 %ld.64, 63 + %bit = and i128 %srl, 1 + store i128 %bit, i128 addrspace(1)* %out.gep + ret void +} + +; Extract the high bit of the 3rd quarter +; GCN-LABEL: {{^}}v_uextract_bit_95_i128: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: s_endpgm +define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x + %ld.64 = load i128, i128 addrspace(1)* %in.gep + %srl = lshr i128 %ld.64, 95 + %bit = and i128 %srl, 1 + store i128 %bit, i128 addrspace(1)* %out.gep + ret void +} + +; Extract the high bit of the 4th quarter +; GCN-LABEL: {{^}}v_uextract_bit_127_i128: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: s_endpgm 
+define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x + %ld.64 = load i128, i128 addrspace(1)* %in.gep + %srl = lshr i128 %ld.64, 127 + %bit = and i128 %srl, 1 + store i128 %bit, i128 addrspace(1)* %out.gep + ret void +} + +; Spans more than 2 dword boundaries +; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128: +; GCN: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN: buffer_load_dword v[[VAL1:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} + +; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[}}[[VAL2]]:[[VAL3]]{{\]}}, 30 +; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v[[VAL1]] +; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] +; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}} + +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ELT2PART]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: s_endpgm +define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x + %ld.64 = load i128, i128 addrspace(1)* %in.gep + %srl = lshr i128 %ld.64, 34 + %bit = and i128 %srl, 73786976294838206463 + store i128 %bit, i128 addrspace(1)* %out.gep + ret void +} + +declare i32 
@llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -0,0 +1,387 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half. + +; Extract the high bit of the low half +; GCN-LABEL: {{^}}v_uextract_bit_31_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 31 + %bit = and i64 %srl, 1 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; Extract the high bit of the high half +; GCN-LABEL: {{^}}v_uextract_bit_63_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 63 + %bit = and i64 %srl, 1 + 
store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_1_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 1 + %bit = and i64 %srl, 1 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_20_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 20 + %bit = and i64 %srl, 1 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_32_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 
@llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 32 + %bit = and i64 %srl, 1 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_33_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 33 + %bit = and i64 %srl, 1 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 20 + %bit = and i64 %srl, 3 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: 
v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 1 + %bit = and i64 %srl, 1073741823 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 1 + %bit = and i64 %srl, 2147483647 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; Spans the dword boundary, so requires full shift +; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr 
i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 31 + %bit = and i64 %srl, 3 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 33 + %bit = and i64 %srl, 3 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30 +; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 30 + %bit = and i64 %srl, 1073741823 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 +; GCN-DAG: 
v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 33 + %bit = and i64 %srl, 1073741823 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}} +; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} +define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 31 + %and = and i64 %srl, 4294967295 + store i64 %and, i64 addrspace(1)* %out + ret void +} + +; trunc applied before and mask +; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] +; GCN: buffer_store_dword v[[SHIFT]] +define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 31 + %trunc = trunc i64 %srl to i32 + %bit = and i32 %trunc, 1 + store i32 
%bit, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}} +; GCN: buffer_store_dword [[BFE]] +define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 3 + %trunc = trunc i64 %srl to i32 + %bit = and i32 %trunc, 1 + store i32 %bit, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}} +; GCN: buffer_store_dword [[BFE]] +define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 33 + %trunc = trunc i64 %srl to i32 + %bit = and i32 %trunc, 1 + store i32 %bit, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] +; GCN-NOT: v[[SHRLO]] +; GCN: buffer_store_dword v[[SHRLO]] +define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 
addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 31 + %trunc = trunc i64 %srl to i32 + %bit = and i32 %trunc, 3 + store i32 %bit, i32 addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}and_not_mask_i64: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 20 +; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, v[[SHRLO]] +; GCN-DAG: v_mov_b32_e32 v[[SHRHI]], 0{{$}} +; GCN-NOT: v[[SHRLO]] +; GCN-NOT: v[[SHRHI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} +define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 20 + %bit = and i64 %srl, 4 + store i64 %bit, i64 addrspace(1)* %out.gep + ret void +} + +; The instruction count is the same with/without hasOneUse, but +; keeping the 32-bit and has a smaller encoding size than the bfe. 
+ +; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64: +; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27 +; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 27 + %bit = and i64 %srl, 3 + store volatile i64 %srl, i64 addrspace(1)* %out + store volatile i64 %bit, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]] +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO]]{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO2]]{{\]}} +define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 34 + %bit = and i64 %srl, 7 + store volatile i64 %srl, i64 addrspace(1)* %out + store volatile i64 %bit, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}v_uextract_bit_33_36_use_upper_half_shift_i64: +; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dword v[[ZERO]] +define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x + %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x + %out1.gep = getelementptr i32, i32 addrspace(1)* %out1, i32 %id.x + %ld.64 = load i64, i64 addrspace(1)* %in.gep + %srl = lshr i64 %ld.64, 33 + %bit = and i64 %srl, 7 + store volatile i64 %bit, i64 addrspace(1)* %out0.gep + + %srl.srl32 = lshr i64 %srl, 32 + %srl.hi = trunc i64 %srl.srl32 to i32 + store volatile i32 %srl.hi, i32 addrspace(1)* %out1.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/X86/bmi.ll =================================================================== --- test/CodeGen/X86/bmi.ll +++ test/CodeGen/X86/bmi.ll @@ -142,7 +142,7 @@ %2 = and i64 %1, 4095 ret i64 %2 ; CHECK-LABEL: bextr64b: -; CHECK: bextrq +; CHECK: bextrl } define i64 @bextr64b_load(i64* %x) { @@ -151,7 +151,7 @@ %3 = and i64 %2, 4095 ret i64 %3 ; CHECK-LABEL: bextr64b_load: -; CHECK: bextrq {{.*}}, ({{.*}}), {{.*}} +; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}} } define i32 @non_bextr32(i32 %x) {