diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7456,9 +7456,33 @@
   llvm_unreachable("invalid ext type");
 }
 
+static bool isDWORDAligned(SelectionDAG &DAG, MachineFunction &MF, SDValue Op) {
+  if (Op.getOpcode() == ISD::CopyFromReg) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    Register VReg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+    MCRegister Reg = MRI.getLiveInPhysReg(VReg);
+    if (!Reg.isValid())
+      return false;
+
+    // Certain preloaded registers are guaranteed to be DWORD aligned.
+    const ArgDescriptor *AD;
+    const TargetRegisterClass *RC;
+    std::tie(AD, RC) =
+        MFI->getPreloadedValue(AMDGPUFunctionArgInfo::DISPATCH_PTR);
+    if (AD && AD->isRegister() && AD->getRegister() == Reg)
+      return true;
+
+    return false;
+  }
+
+  // Otherwise, check the known bits of the address; at least two trailing
+  // zeros guarantee DWORD alignment.
+  KnownBits Known = DAG.computeKnownBits(Op);
+  return Known.countMinTrailingZeros() >= 2;
+}
+
 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                     DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
-  if (Ld->getAlignment() < 4 || Ld->isDivergent())
+  if (Ld->isDivergent())
     return SDValue();
 
   // FIXME: Constant loads should all be marked invariant.
@@ -7481,33 +7505,68 @@
   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
          "unexpected vector extload");
 
-  // TODO: Drop only high part of range.
-  SDValue Ptr = Ld->getBasePtr();
-  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
-                                MVT::i32, SL, Ld->getChain(), Ptr,
-                                Ld->getOffset(),
-                                Ld->getPointerInfo(), MVT::i32,
-                                Ld->getAlignment(),
-                                Ld->getMemOperand()->getFlags(),
-                                Ld->getAAInfo(),
-                                nullptr); // Drop ranges
-
-  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
-  if (MemVT.isFloatingPoint()) {
-    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
-           "unexpected fp extload");
-    TruncVT = MemVT.changeTypeToInteger();
-  }
-
-  SDValue Cvt = NewLoad;
-  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
-    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
-                      DAG.getValueType(TruncVT));
-  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
-             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
-    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+  SDValue NewLoad, Cvt;
+  if (Ld->getAlign() < 4) {
+    // Special handling of non-DWORD-aligned loads. So far, only scalar loads
+    // are handled.
+    if (MemVT.isVector())
+      return SDValue();
+    // Skip non-naturally aligned loads.
+    if (Ld->getAlign() < MemVT.getStoreSize())
+      return SDValue();
+    // FIXME: Support other types.
+    if (MemVT != MVT::i16)
+      return SDValue();
+    // For a naturally aligned but not DWORD-aligned load, try to widen it if
+    // there's a constant offset from a DWORD-aligned base.
+    SDValue Ptr = Ld->getBasePtr();
+    if (!DAG.isBaseWithConstantOffset(Ptr))
+      return SDValue();
+    SDValue BasePtr = Ptr.getOperand(0);
+    if (!isDWORDAligned(DAG, DAG.getMachineFunction(), BasePtr))
+      return SDValue();
+
+    EVT VT = Ptr.getValueType();
+    int64_t Offset = cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue();
+    SDValue NewPtr = DAG.getNode(ISD::ADD, SL, VT, BasePtr,
+                                 DAG.getConstant(Offset - 2, SL, VT));
+    // Now, the new load is DWORD aligned.
+    // TODO: Drop only low part of range.
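+    // For example (cf. the test update below): an i16 load at byte offset 6
+    // from the DWORD-aligned dispatch pointer becomes an i32 load at byte
+    // offset 4, and the i16 value is recovered from the high 16 bits by the
+    // shift emitted below.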
+    NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL,
+                          Ld->getChain(), NewPtr, Ld->getOffset(),
+                          Ld->getPointerInfo(), MVT::i32, Align(4),
+                          Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
+                          nullptr); // Drop ranges
+    // Extract the high bits.
+    Cvt = DAG.getNode(
+        Ld->getExtensionType() == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, SL,
+        MVT::i32, NewLoad, DAG.getShiftAmountConstant(16, MVT::i32, SL));
   } else {
-    assert(Ld->getExtensionType() == ISD::EXTLOAD);
+    SDValue Ptr = Ld->getBasePtr();
+    // TODO: Drop only high part of range.
+    NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL,
+                          Ld->getChain(), Ptr, Ld->getOffset(),
+                          Ld->getPointerInfo(), MVT::i32, Ld->getAlignment(),
+                          Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
+                          nullptr); // Drop ranges
+
+    EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+    if (MemVT.isFloatingPoint()) {
+      assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+             "unexpected fp extload");
+      TruncVT = MemVT.changeTypeToInteger();
+    }
+
+    if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+      Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+                        DAG.getValueType(TruncVT));
+    } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+               Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+      Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+    } else {
+      assert(Ld->getExtensionType() == ISD::EXTLOAD);
+      Cvt = NewLoad;
+    }
   }
 
   EVT VT = Ld->getValueType(0);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -14,6 +14,22 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test2
+; GCN: enable_sgpr_dispatch_ptr = 1
+; GCN: s_load_dword s[[REG:[0-9]+]], s[4:5], 0x1
+; GCN: s_lshr_b32 s{{[0-9]+}}, s[[REG]], 16
+; GCN-NOT: load_ushort
+; GCN: s_endpgm
define amdgpu_kernel void @test2(i32 addrspace(1)* %out) {
+  %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+  %d1 = getelementptr inbounds i8, i8 addrspace(4)* %dispatch_ptr, i64 6
+  %h1 = bitcast i8 addrspace(4)* %d1 to i16 addrspace(4)*
+  %v1 = load i16, i16 addrspace(4)* %h1
+  %e1 = zext i16 %v1 to i32
+  store i32 %e1, i32 addrspace(1)* %out
+  ret void
+}
+
 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
 
 attributes #0 = { readnone }