Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -118,6 +118,12 @@
   unsigned getNumberOfRegisters(bool Vector) const;
   unsigned getRegisterBitWidth(bool Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
+  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+                               unsigned ChainSizeInBytes,
+                               VectorType *VecTy) const;
+  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+                                unsigned ChainSizeInBytes,
+                                VectorType *VecTy) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
 
   bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -234,12 +234,38 @@
   return 32;
 }
 
+unsigned AMDGPUTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
+                                            unsigned ChainSizeInBytes,
+                                            VectorType *VecTy) const {
+  unsigned VecRegBitWidth = VF * LoadSize;
+  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
+    // TODO: Support element-size less than 32bit?
+    return 128 / LoadSize;
+
+  return VF;
+}
+
+unsigned AMDGPUTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
+                                             unsigned ChainSizeInBytes,
+                                             VectorType *VecTy) const {
+  unsigned VecRegBitWidth = VF * StoreSize;
+  if (VecRegBitWidth > 128)
+    return 128 / StoreSize;
+
+  return VF;
+}
+
 unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
   AMDGPUAS AS = ST->getAMDGPUAS();
   if (AddrSpace == AS.GLOBAL_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS ||
-      AddrSpace == AS.CONSTANT_ADDRESS_32BIT ||
-      AddrSpace == AS.FLAT_ADDRESS)
+      AddrSpace == AS.CONSTANT_ADDRESS_32BIT) {
+    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      return 128;
+    return 512;
+  }
+
+  if (AddrSpace == AS.FLAT_ADDRESS)
     return 128;
   if (AddrSpace == AS.LOCAL_ADDRESS ||
       AddrSpace == AS.REGION_ADDRESS)
@@ -250,8 +276,8 @@
   if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
       (AddrSpace == AS.PARAM_D_ADDRESS ||
       AddrSpace == AS.PARAM_I_ADDRESS ||
-       (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
-        AddrSpace <= AS.CONSTANT_BUFFER_15)))
+      (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+      AddrSpace <= AS.CONSTANT_BUFFER_15)))
     return 128;
   llvm_unreachable("unhandled address space");
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3454,6 +3454,10 @@
   return false;
 }
 
+static bool isDwordAligned(unsigned Alignment) {
+  return Alignment % 4 == 0;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -5353,9 +5357,10 @@
   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
          "Custom lowering for non-i32 vectors hasn't been implemented.");
 
+  unsigned Alignment = Load->getAlignment();
   unsigned AS = Load->getAddressSpace();
   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
-                          AS, Load->getAlignment())) {
+                          AS, Alignment)) {
     SDValue Ops[2];
     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
     return DAG.getMergeValues(Ops, DL);
@@ -5383,7 +5388,8 @@
       AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
       AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
-        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
+        !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+        isDwordAligned(Alignment))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
Index: llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -666,8 +666,12 @@
     unsigned AS = Ptr->getType()->getPointerAddressSpace();
     unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
 
+    unsigned VF = VecRegSize / TySize;
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
     // No point in looking at these if they're too big to vectorize.
-    if (TySize > VecRegSize / 2)
+    if (TySize > VecRegSize / 2 ||
+        (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
       continue;
 
     // Make sure all the users of a vector are constant-index extracts.
@@ -709,8 +713,12 @@
     unsigned AS = Ptr->getType()->getPointerAddressSpace();
     unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
 
+    unsigned VF = VecRegSize / TySize;
+    VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
    // No point in looking at these if they're too big to vectorize.
-    if (TySize > VecRegSize / 2)
+    if (TySize > VecRegSize / 2 ||
+        (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
       continue;
 
     if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
Index: llvm/trunk/test/CodeGen/AMDGPU/load-constant-f32.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-constant-f32.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/load-constant-f32.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; Tests whether a load chain of 8 constants gets vectorized into a wider load.
+; FUNC-LABEL: {{^}}constant_load_v8f32:
+; GCN: s_load_dwordx8
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define amdgpu_kernel void @constant_load_v8f32(float addrspace(4)* noalias nocapture readonly %weights, float addrspace(1)* noalias nocapture %out_ptr) {
+entry:
+  %out_ptr.promoted = load float, float addrspace(1)* %out_ptr, align 4
+  %tmp = load float, float addrspace(4)* %weights, align 4
+  %add = fadd float %tmp, %out_ptr.promoted
+  %arrayidx.1 = getelementptr inbounds float, float addrspace(4)* %weights, i64 1
+  %tmp1 = load float, float addrspace(4)* %arrayidx.1, align 4
+  %add.1 = fadd float %tmp1, %add
+  %arrayidx.2 = getelementptr inbounds float, float addrspace(4)* %weights, i64 2
+  %tmp2 = load float, float addrspace(4)* %arrayidx.2, align 4
+  %add.2 = fadd float %tmp2, %add.1
+  %arrayidx.3 = getelementptr inbounds float, float addrspace(4)* %weights, i64 3
+  %tmp3 = load float, float addrspace(4)* %arrayidx.3, align 4
+  %add.3 = fadd float %tmp3, %add.2
+  %arrayidx.4 = getelementptr inbounds float, float addrspace(4)* %weights, i64 4
+  %tmp4 = load float, float addrspace(4)* %arrayidx.4, align 4
+  %add.4 = fadd float %tmp4, %add.3
+  %arrayidx.5 = getelementptr inbounds float, float addrspace(4)* %weights, i64 5
+  %tmp5 = load float, float addrspace(4)* %arrayidx.5, align 4
+  %add.5 = fadd float %tmp5, %add.4
+  %arrayidx.6 = getelementptr inbounds float, float addrspace(4)* %weights, i64 6
+  %tmp6 = load float, float addrspace(4)* %arrayidx.6, align 4
+  %add.6 = fadd float %tmp6, %add.5
+  %arrayidx.7 = getelementptr inbounds float, float addrspace(4)* %weights, i64 7
+  %tmp7 = load float, float addrspace(4)* %arrayidx.7, align 4
+  %add.7 = fadd float %tmp7, %add.6
+  store float %add.7, float addrspace(1)* %out_ptr, align 4
+  ret void
+}
\ No newline at end of file
Index: llvm/trunk/test/CodeGen/AMDGPU/load-constant-f64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -13,3 +13,36 @@
 }
 
 attributes #0 = { nounwind }
+
+; Tests whether a load chain of 8 constants of 64 bits each gets vectorized into a wider load.
+; FUNC-LABEL: {{^}}constant_load_2v4f64:
+; GCN: s_load_dwordx16
+define amdgpu_kernel void @constant_load_2v4f64(double addrspace(4)* noalias nocapture readonly %weights, double addrspace(1)* noalias nocapture %out_ptr) {
+entry:
+  %out_ptr.promoted = load double, double addrspace(1)* %out_ptr, align 4
+  %tmp = load double, double addrspace(4)* %weights, align 4
+  %add = fadd double %tmp, %out_ptr.promoted
+  %arrayidx.1 = getelementptr inbounds double, double addrspace(4)* %weights, i64 1
+  %tmp1 = load double, double addrspace(4)* %arrayidx.1, align 4
+  %add.1 = fadd double %tmp1, %add
+  %arrayidx.2 = getelementptr inbounds double, double addrspace(4)* %weights, i64 2
+  %tmp2 = load double, double addrspace(4)* %arrayidx.2, align 4
+  %add.2 = fadd double %tmp2, %add.1
+  %arrayidx.3 = getelementptr inbounds double, double addrspace(4)* %weights, i64 3
+  %tmp3 = load double, double addrspace(4)* %arrayidx.3, align 4
+  %add.3 = fadd double %tmp3, %add.2
+  %arrayidx.4 = getelementptr inbounds double, double addrspace(4)* %weights, i64 4
+  %tmp4 = load double, double addrspace(4)* %arrayidx.4, align 4
+  %add.4 = fadd double %tmp4, %add.3
+  %arrayidx.5 = getelementptr inbounds double, double addrspace(4)* %weights, i64 5
+  %tmp5 = load double, double addrspace(4)* %arrayidx.5, align 4
+  %add.5 = fadd double %tmp5, %add.4
+  %arrayidx.6 = getelementptr inbounds double, double addrspace(4)* %weights, i64 6
+  %tmp6 = load double, double addrspace(4)* %arrayidx.6, align 4
+  %add.6 = fadd double %tmp6, %add.5
+  %arrayidx.7 = getelementptr inbounds double, double addrspace(4)* %weights, i64 7
+  %tmp7 = load double, double addrspace(4)* %arrayidx.7, align 4
+  %add.7 = fadd double %tmp7, %add.6
+  store double %add.7, double addrspace(1)* %out_ptr, align 4
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 | FileCheck --check-prefix=GCN %s
 
 ; Check that the waitcnt insertion algorithm correctly propagates wait counts
 ; from before a loop to the loop header.