Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -87,6 +87,8 @@
   /// Return 64-bit value Op as two 32-bit integers.
   std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
                                               SelectionDAG &DAG) const;
+  SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
+  SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
 
   /// \brief Split a vector load into a scalar load of each component.
   SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -378,6 +378,7 @@
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SELECT);
@@ -1193,6 +1194,22 @@
   return std::make_pair(Lo, Hi);
 }
 
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+}
+
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc SL(Op);
+
+  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
+}
+
 SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
                                                   SelectionDAG &DAG) const {
   LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -2626,6 +2643,43 @@
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
 }
 
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  if (N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  unsigned RHSVal = RHS->getZExtValue();
+
+  // sra i64:x, 32 -> build_pair hi_32(x), (sra hi_32(x), 31)
+  if (RHSVal == 32) {
+    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                   DAG.getConstant(31, SL, MVT::i32));
+
+    SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                   Hi, NewShift);
+    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+  }
+
+  // sra i64:x, 63 -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+  if (RHSVal == 63) {
+    SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+    SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+                                   DAG.getConstant(31, SL, MVT::i32));
+    SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                                   NewShift, NewShift);
+    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+  }
+
+  return SDValue();
+}
+
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   if (N->getValueType(0) != MVT::i64)
     return SDValue();
@@ -2804,6 +2858,12 @@
     return performSrlCombine(N, DCI);
   }
+  case ISD::SRA: {
+    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+      break;
+
+    return performSraCombine(N, DCI);
+  }
   case ISD::AND: {
     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
       break;
Index: test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -105,10 +105,18 @@
 
 ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
-; GCN-LABEL: {{^}}ashr_i64_const_gt_32:
-define void @ashr_i64_const_gt_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+; GCN-LABEL: {{^}}ashr_i64_const_32:
+define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
-  %shl = ashr i64 %val, 35
+  %shl = ashr i64 %val, 32
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_i64_const_63:
+define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = ashr i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
   ret void
 }
Index: test/CodeGen/AMDGPU/sint_to_fp.i64.ll
===================================================================
--- test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -13,8 +13,7 @@
 ; FUNC-LABEL: {{^}}v_sint_to_fp_i64_to_f32:
 ; GCN: {{buffer|flat}}_load_dwordx2
 
-; SI: v_ashr_i64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 63
-; VI: v_ashrrev_i64 {{v\[[0-9]+:[0-9]+\]}}, 63, {{v\[[0-9]+:[0-9]+\]}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
 
 ; GCN: v_xor_b32
 ; GCN: v_ffbh_u32
Index: test/CodeGen/AMDGPU/sra.ll
===================================================================
--- test/CodeGen/AMDGPU/sra.ll
+++ test/CodeGen/AMDGPU/sra.ll
@@ -201,10 +201,10 @@
 }
 
 ; GCN-LABEL: {{^}}s_ashr_32_i64:
-; GCN: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN: s_ashr_i64 [[SHIFT:s\[[0-9]+:[0-9]+\]]], [[VAL]], 32
-; GCN: s_add_u32
-; GCN: s_addc_u32
+; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
+; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
 define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 32
   %add = add i64 %result, %b
@@ -213,10 +213,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_ashr_32_i64:
-; GCN: {{buffer|flat}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
-; SI: v_ashr_i64 [[SHIFT:v\[[0-9]+:[0-9]+\]]], [[VAL]], 32
-; VI: v_ashrrev_i64 [[SHIFT:v\[[0-9]+:[0-9]+\]]], 32, [[VAL]]
-; GCN: {{buffer|flat}}_store_dwordx2 [[SHIFT]]
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
 define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
@@ -228,9 +228,11 @@
 }
 
 ; GCN-LABEL: {{^}}s_ashr_63_i64:
-; GCN-DAG: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN: s_ashr_i64 [[SHIFT:s\[[0-9]+:[0-9]+\]]], [[VAL]], 63
-; GCN: s_add_u32
+; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
+; GCN: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
+; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
+; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
 define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 63
   %add = add i64 %result, %b
@@ -239,10 +241,11 @@
 }
 
 ; GCN-LABEL: {{^}}v_ashr_63_i64:
-; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
-; SI: v_ashr_i64 [[SHIFT:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
-; VI: v_ashrrev_i64 [[SHIFT:v\[[0-9]+:[0-9]+\]]], 63, [[VAL]]
-; GCN: {{buffer|flat}}_store_dwordx2 [[SHIFT]]
+; SI: buffer_load_dword v[[HI:[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; VI: flat_load_dword v[[HI:[0-9]+]]
+; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
+; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
+; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
 define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid