Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -69,6 +69,7 @@
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS,
                              DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -377,6 +377,7 @@
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -2562,7 +2563,45 @@
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
 
   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
-  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Zero, Lo);
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  unsigned ShiftAmt = RHS->getZExtValue();
+  if (ShiftAmt < 32)
+    return SDValue();
+
+  // srl (i64 x), C for C >= 32
+  // =>
+  //   build_pair (srl (i32 hi_32(x)), (C - 32)), 0
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+  SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+                           VecOp, One);
+
+  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
+  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+
+  SDValue BuildPair = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                  NewShift, Zero);
+
+  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
 }
 
 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -2701,6 +2740,12 @@
     return performShlCombine(N, DCI);
   }
+  case ISD::SRL: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performSrlCombine(N, DCI);
+  }
   case ISD::MUL:
     return performMulCombine(N, DCI);
   case AMDGPUISD::MUL_I24:
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -396,8 +396,6 @@
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]], 32
-; VI: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, 32, [[LOAD]]
 ; GCN: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
 ; GCN: v_cvt_f32_f16_e32
Index: test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=FAST64 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=SLOW64 -check-prefix=GCN %s
+
+
+; lshr (i64 x), c: c > 32 => reg_sequence lshr (i32 hi_32(x)), (c - 32), 0
+; GCN-LABEL: {{^}}lshr_i64_35:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = lshr i64 %val, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_63:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = lshr i64 %val, 63
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_33:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = lshr i64 %val, 33
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_i64_32:
+; GCN: buffer_load_dword v[[LO:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = lshr i64 %val, 32
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_and_i64_35:
+; XGCN: buffer_load_dword [[VAL:v[0-9]+]]
+; XGCN: v_lshlrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
+; XGCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; XGCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %and = and i64 %val, 2147483647 ; 0x7fffffff
+  %shl = lshr i64 %and, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
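
Note for reviewers (not part of the patch): the combine rests on the scalar identity that, for a constant shift amount C with 32 <= C < 64, a 64-bit logical shift right equals a 32-bit shift of the high half by C - 32, zero-extended back to 64 bits. Below is a minimal standalone C++ sketch of that identity; the helper name lshr64_via_hi32 is hypothetical, chosen here purely for illustration.

// Standalone sanity check of the identity performSrlCombine relies on:
//   x >> C  ==  zext_i64(hi_32(x) >> (C - 32))   for 32 <= C < 64.
#include <cassert>
#include <cstdint>

static uint64_t lshr64_via_hi32(uint64_t X, unsigned C) {
  assert(C >= 32 && C < 64 && "combine only fires for constant C >= 32");
  uint32_t Hi = static_cast<uint32_t>(X >> 32); // EXTRACT_VECTOR_ELT, element 1
  uint32_t Lo = Hi >> (C - 32);                 // the narrowed 32-bit SRL
  return static_cast<uint64_t>(Lo);             // BUILD_VECTOR (Lo, 0) + BITCAST
}

int main() {
  const uint64_t Vals[] = {0x0ULL, 0x1ULL, 0xdeadbeefcafef00dULL,
                           0xffffffffffffffffULL};
  for (uint64_t V : Vals)
    for (unsigned C = 32; C < 64; ++C)
      assert(lshr64_via_hi32(V, C) == V >> C);
  return 0;
}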