Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
@@ -53,6 +54,7 @@
   DivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
+  AMDGPUAS AMDGPUASI;
 
   /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -123,6 +125,16 @@
   ///
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
+
+  /// \brief Check whether a scalar load can be widened to a 32-bit load.
+  ///
+  /// \details For a uniform, small-type load from constant memory, the load
+  /// can be widened to i32 when its alignment is at least 4 bytes, it is not
+  /// volatile, and the loaded type is narrower than 32 bits. If any of these
+  /// conditions does not hold, the load is left unchanged.
+  ///
+  /// \returns True if the load should be widened to i32.
+  bool widenScalarLoadtoI32(LoadInst &I) const;
 
 public:
   static char ID;
@@ -133,6 +145,7 @@
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
+  bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
 
@@ -223,6 +236,16 @@
   }
 }
 
+bool AMDGPUCodeGenPrepare::widenScalarLoadtoI32(LoadInst &I) const {
+  Type *Ty = I.getType();
+  const DataLayout &DL = Mod->getDataLayout();
+  int TySize = DL.getTypeSizeInBits(Ty);
+  unsigned Align = I.getAlignment() ?
+                   I.getAlignment() : DL.getABITypeAlignment(Ty);
+
+  return !I.isVolatile() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+}
+
 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
   assert(needsPromotionToI32(I.getType()) &&
          "I does not need promotion to i32");
@@ -441,6 +464,29 @@
   return Changed;
 }
 
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+      widenScalarLoadtoI32(I)) {
+    IRBuilder<> Builder(&I);
+    Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+    Type *I32Ty = Builder.getInt32Ty();
+    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
+    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
+    Value *WidenLoad = Builder.CreateLoad(BitCast);
+
+    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
+    Type *IntNTy = Builder.getIntNTy(TySize);
+    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
+    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
+    I.replaceAllUsesWith(ValOrig);
+    I.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
 
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -519,7 +519,7 @@
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_i8:
-; SI: buffer_load_ubyte
+; SI: s_load_dword
 ; SI: buffer_store_byte
 define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
@@ -528,7 +528,7 @@
 }
 
 ; SI-LABEL: {{^}}constant_align2_load_i8:
-; SI: buffer_load_ubyte
+; SI: s_load_dword
 ; SI: buffer_store_byte
 define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 2
Index: test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@@ -0,0 +1,192 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: @constant_load_i1
+; GCN: load i1
+; GCN-NEXT: store i1
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in
+  store i1 %val, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align2
+; GCN: load i1
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 2
+  store i1 %val, i1 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align4
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 4
+  store i1 %val, i1 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8_align2
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 2
+  store i8 %val, i8 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8align4
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 4
+  store i8 %val, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: @constant_load_v2i8
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_align4
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8_align4
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16
+; GCN: load i16
+; GCN: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16_align4
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in, align 4
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_f16
+; GCN: load half
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
+  %ld = load half, half addrspace(2)* %in
+  store half %ld, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2f16
+; GCN: load <2 x half>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
+  %ld = load <2 x half>, <2 x half> addrspace(2)* %in
+  store <2 x half> %ld, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @load_volatile
+; GCN: load volatile i16
+; GCN-NEXT: store
+define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+  %a = load volatile i16, i16 addrspace(2)* %in
+  store i16 %a, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_volatile
+; GCN: load volatile <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_addrespace1
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_addrespace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @use_dispatch_ptr
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: zext
+; GCN-NEXT: store
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
+  %ld = zext i8 %val to i32
+  store i32 %ld, i32 addrspace(1)* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }
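
Note on the transformation (illustrative only; the value names below are invented for readability, since visitLoadInst emits unnamed IRBuilder temporaries): for a uniform, 4-byte-aligned i8 load from the constant address space, the rewrite described on widenScalarLoadtoI32 above turns

  %val = load i8, i8 addrspace(2)* %in, align 4

into roughly

  %ptr32 = bitcast i8 addrspace(2)* %in to i32 addrspace(2)*
  %wide  = load i32, i32 addrspace(2)* %ptr32
  %val   = trunc i32 %wide to i8

The final bitcast back to the original type that visitLoadInst creates folds away for scalar integer types, which is why the constant_load_i8align4 test above only checks for bitcast, load i32, trunc, and store.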