Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2546,12 +2546,15 @@
 
   // i64 (shl x, 32) -> (build_pair 0, x)
 
-  // Doing this with moves theoretically helps MI optimizations that understand
-  // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
-  // v_lshl_b64. In the SALU case, I think this is slightly worse since it
-  // doubles the code size and I'm unsure about cycle count.
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS || RHS->getZExtValue() != 32)
+  if (!RHS)
+    return SDValue();
+
+  unsigned RHSVal = RHS->getZExtValue();
+  if (RHSVal < 32)
     return SDValue();
 
   SDValue LHS = N->getOperand(0);
@@ -2559,12 +2562,14 @@
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
-  // Extract low 32-bits.
+  SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+  SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
 
   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
 
-  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Zero, Lo);
+  SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Zero, NewShift);
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
 }
 
Index: test/CodeGen/AMDGPU/shift-i64-opts.ll
===================================================================
--- test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -62,3 +62,50 @@
   store i64 %shl, i64 addrspace(1)* %out
   ret void
 }
+
+; lshl (i64 x), c: c > 32 => reg_sequence lshl 0, (i32 lo_32(x)), (c - 32)
+
+; GCN-LABEL: {{^}}shl_i64_const_35:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_32:
+; GCN: buffer_load_dword v[[HI:[0-9]+]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
+define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 32
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_i64_const_63:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = shl i64 %val, 63
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
+
+; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
+
+; GCN-LABEL: {{^}}ashr_i64_const_gt_32:
+define void @ashr_i64_const_gt_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in
+  %shl = ashr i64 %val, 35
+  store i64 %shl, i64 addrspace(1)* %out
+  ret void
+}
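
Not part of the patch, just a sanity check on the combine's semantics: the sketch below is a minimal standalone C++ model of what the new DAG nodes compute for a constant shift amount c with 32 <= c <= 63 (TRUNCATE the source to i32, shift by c - 32, and place the result in the high half of a zero-low pair). The helper name shl64_const_ge32 and the test constant are illustrative only and appear nowhere in the patch.

#include <cassert>
#include <cstdint>

// Models i64 (shl x, c) for 32 <= c <= 63 as built by the combine:
//   low 32 bits  = 0                        (the Zero element of the BUILD_VECTOR)
//   high 32 bits = (uint32_t)x << (c - 32)  (TRUNCATE of x, then a 32-bit SHL)
static uint64_t shl64_const_ge32(uint64_t x, unsigned c) {
  assert(c >= 32 && c <= 63);
  uint32_t hi = static_cast<uint32_t>(x) << (c - 32);
  return static_cast<uint64_t>(hi) << 32;  // BITCAST of the (0, hi) pair back to i64
}

int main() {
  // Spot-check against the native 64-bit shift for the shift amounts the new
  // tests exercise (32, 35, 63).
  const uint64_t x = 0x0123456789abcdefULL;
  const unsigned amounts[] = {32, 35, 63};
  for (unsigned c : amounts)
    assert(shl64_const_ge32(x, c) == (x << c));
  return 0;
}

The model mirrors why the combine is profitable: the 64-bit result needs only one 32-bit shift plus a zero move, which is what the new v_lshlrev_b32_e32 / v_mov_b32_e32 checks in shift-i64-opts.ll expect.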