Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
@@ -53,6 +54,7 @@
   DivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
   bool HasUnsafeFPMath = false;
+  AMDGPUAS AMDGPUASI;
 
   /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
   /// binary operation \p V.
@@ -133,6 +135,7 @@
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitBinaryOperator(BinaryOperator &I);
+  bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
   bool visitSelectInst(SelectInst &I);
 
@@ -441,6 +444,44 @@
   return Changed;
 }
 
+/// Widen a uniform, naturally aligned sub-dword load from the constant address
+/// space into a 32-bit load followed by a trunc (and a bitcast back to the
+/// original type), so that a scalar dword load can be selected for it.
+///
+/// \returns true if \p I was replaced and erased.
+bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+  Type *Ty = I.getType();
+  const DataLayout &DL = Mod->getDataLayout();
+  uint64_t TySize = DL.getTypeSizeInBits(Ty);
+  unsigned Align = I.getAlignment() ?
+                   I.getAlignment() : DL.getABITypeAlignment(Ty);
+
+  // Only simple loads may be widened: isSimple() rejects atomic as well as
+  // volatile accesses, and a plain i32 load would drop an atomic's ordering.
+  // Aggregates are skipped because they cannot be bitcast from an integer.
+  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+      I.isSimple() && !Ty->isAggregateType() && TySize < 32 && Align >= 4 &&
+      DA->isUniform(&I)) {
+    IRBuilder<> Builder(&I);
+    Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+    Type *I32Ty = Builder.getInt32Ty();
+    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
+    Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
+    Value *WidenLoad = Builder.CreateLoad(BitCast);
+
+    // Keep only the bits of the original type, then restore its exact type.
+    Type *IntNTy = Builder.getIntNTy(TySize);
+    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
+    Value *ValOrig = Builder.CreateBitCast(ValTrunc, Ty);
+    I.replaceAllUsesWith(ValOrig);
+    I.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
 
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -519,7 +519,7 @@
 }
 
 ; SI-LABEL: {{^}}constant_align4_load_i8:
-; SI: buffer_load_ubyte
+; SI: s_load_dword
 ; SI: buffer_store_byte
 define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
Index: test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@@ -0,0 +1,201 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
+
+; GCN-LABEL: @constant_load_i1(
+; GCN: load i1
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in
+  store i1 %val, i1 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align2(
+; GCN: load i1
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 2
+  store i1 %val, i1 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i1_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
+  %val = load i1, i1 addrspace(2)* %in, align 4
+  store i1 %val, i1 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8(
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in
+  store i8 %val, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8_align2(
+; GCN: load i8
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 2
+  store i8 %val, i8 addrspace(1)* %out, align 2
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i8align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+  %val = load i8, i8 addrspace(2)* %in, align 4
+  store i8 %val, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+; GCN-LABEL: @constant_load_v2i8(
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_align4(
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v3i8_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: bitcast
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+  %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
+  store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16(
+; GCN: load i16
+; GCN: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16_align4(
+; GCN: bitcast
+; GCN-NEXT: load
+; GCN-NEXT: trunc
+; GCN-NEXT: sext
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load i16, i16 addrspace(2)* %in, align 4
+  %ext = sext i16 %ld to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: @constant_load_half(
+; GCN: load half
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_half(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
+  %ld = load half, half addrspace(2)* %in
+  store half %ld, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2half(
+; GCN: load <2 x half>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
+  %ld = load <2 x half>, <2 x half> addrspace(2)* %in
+  store <2 x half> %ld, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @load_volatile(
+; GCN: load volatile
+; GCN-NEXT: store
+define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+  %a = load volatile i16, i16 addrspace(2)* %in
+  store i16 %a, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_volatile(
+; GCN: load volatile
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+  %ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_i16_align4_atomic(
+; GCN: load atomic i16
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_i16_align4_atomic(i16 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+  %ld = load atomic i16, i16 addrspace(2)* %in seq_cst, align 4
+  store i16 %ld, i16 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @constant_load_v2i8_addrespace1(
+; GCN: load <2 x i8>
+; GCN-NEXT: store
+define amdgpu_kernel void @constant_load_v2i8_addrespace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+  %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
+  store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: @use_dispatch_ptr(
+; GCN: bitcast
+; GCN-NEXT: load i32
+; GCN-NEXT: trunc
+; GCN-NEXT: zext
+; GCN-NEXT: store
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+  %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  %val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
+  %ld = zext i8 %val to i32
+  store i32 %ld, i32 addrspace(1)* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }