Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -34,6 +34,9 @@
   /// compare.
   SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
 
+public:
+  static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+
 protected:
   const AMDGPUSubtarget *Subtarget;
   AMDGPUAS AMDGPUASI;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -127,6 +127,29 @@
   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
 }
 
+bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
+{
+  assert(Op.getOpcode() == ISD::OR);
+
+  SDValue N0 = Op->getOperand(0);
+  SDValue N1 = Op->getOperand(1);
+  EVT VT = N0.getValueType();
+
+  if (VT.isInteger() && !VT.isVector()) {
+    KnownBits LHSKnown, RHSKnown;
+    DAG.computeKnownBits(N0, LHSKnown);
+
+    if (LHSKnown.Zero.getBoolValue()) {
+      DAG.computeKnownBits(N1, RHSKnown);
+
+      if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -2596,8 +2619,6 @@
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
-  if (VT != MVT::i64)
-    return SDValue();
 
   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHS)
@@ -2618,6 +2639,8 @@
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     // shl (ext x) => zext (shl x), if shift does not overflow int
+    if (VT != MVT::i64)
+      break;
     KnownBits Known;
     SDValue X = LHS->getOperand(0);
     DAG.computeKnownBits(X, Known);
@@ -2628,7 +2651,22 @@
     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
     return DAG.getZExtOrTrunc(Shl, SL, VT);
   }
+  case ISD::OR:  if (!isOrEquivalentToAdd(DAG, LHS)) break;
+  case ISD::ADD: { // Fall through from above
+    // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+    if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+      SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
+                                SDValue(RHS, 0));
+      SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
+                                    SDLoc(C2), VT);
+      return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
+    }
+    break;
   }
+  }
+
+  if (VT != MVT::i64)
+    return SDValue();
 
   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
Index: llvm/trunk/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s
+
+; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+; Only one shift is expected; the GEP shall not produce a separate shift.
+
+; CHECK-LABEL: {{^}}add_const_offset:
+; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0
+; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 0xc80, v[[SHL]]
+; CHECK-NOT: v_lshl
+; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]]
+; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %id, 200
+  %shl = shl i32 %add, 2
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}or_const_offset:
+; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0
+; CHECK: v_or_b32_e32 v[[OR:[0-9]+]], 0x1000, v[[SHL]]
+; CHECK-NOT: v_lshl
+; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]]
+; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = or i32 %id, 256
+  %shl = shl i32 %add, 2
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
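
For reference, the constants expected by the CHECK lines follow directly from the rewrite the patch performs: shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1), with the OR case additionally gated on isOrEquivalentToAdd proving from known bits that the operands share no possibly-set bits. Below is a small standalone C++ sanity check of those identities; it is not code from the patch, the helper name orIsCarryFree is made up for the illustration, and the shift of 4 assumes the i32 GEP scaling has already been folded into the shift (200 << 4 == 0xc80, 256 << 4 == 0x1000).

// Standalone sanity check of the identities behind the combine above.
// Not part of the patch; it only re-derives the constants expected by
// shl-add-to-add-shl.ll.
#include <cassert>
#include <cstdint>

// Concrete-value analogue of the isOrEquivalentToAdd condition: an "or"
// behaves like an "add" when the operands share no set bits, so no carries
// can occur. (Hypothetical helper, named here for illustration only.)
static bool orIsCarryFree(uint32_t A, uint32_t B) { return (A & B) == 0; }

int main() {
  const uint32_t C1 = 4; // total shift once the i32 GEP scaling is folded in
  for (uint32_t Id = 0; Id < 1024; ++Id) {
    // shl (add x, 200), 4  =>  add (shl x, 4), 0xc80
    assert(((Id + 200u) << C1) == ((Id << C1) + 0xc80u));

    // shl (or x, 256), 4   =>  or (shl x, 4), 0x1000
    // The shift distributes over "or" unconditionally; the patch additionally
    // gates the OR case on isOrEquivalentToAdd before taking the ADD path.
    assert(((Id | 256u) << C1) == ((Id << C1) | 0x1000u));

    // When the operands are disjoint, "or" and "add" agree: this is the
    // property isOrEquivalentToAdd establishes from known bits.
    if (orIsCarryFree(Id, 256u))
      assert((Id | 256u) == Id + 256u);
  }
  return 0;
}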