Index: clang/test/CodeGen/ms-intrinsics.c
===================================================================
--- clang/test/CodeGen/ms-intrinsics.c
+++ clang/test/CodeGen/ms-intrinsics.c
@@ -499,13 +499,13 @@
 int test_iso_volatile_load32(int volatile *p) { return __iso_volatile_load32(p); }
 __int64 test_iso_volatile_load64(__int64 volatile *p) { return __iso_volatile_load64(p); }
 
-// CHECK: define{{.*}}i8 @test_iso_volatile_load8(i8*{{[a-z_ ]*}}%p)
+// CHECK: define{{.*}}i8 @test_iso_volatile_load8(i8*{{[a-z0-9_() ]*}}%p)
 // CHECK: = load volatile i8, i8* %p
-// CHECK: define{{.*}}i16 @test_iso_volatile_load16(i16*{{[a-z_ ]*}}%p)
+// CHECK: define{{.*}}i16 @test_iso_volatile_load16(i16*{{[a-z0-9_() ]*}}%p)
 // CHECK: = load volatile i16, i16* %p
-// CHECK: define{{.*}}i32 @test_iso_volatile_load32(i32*{{[a-z_ ]*}}%p)
+// CHECK: define{{.*}}i32 @test_iso_volatile_load32(i32*{{[a-z0-9_() ]*}}%p)
 // CHECK: = load volatile i32, i32* %p
-// CHECK: define{{.*}}i64 @test_iso_volatile_load64(i64*{{[a-z_ ]*}}%p)
+// CHECK: define{{.*}}i64 @test_iso_volatile_load64(i64*{{[a-z0-9_() ]*}}%p)
 // CHECK: = load volatile i64, i64* %p
 
 void test_iso_volatile_store8(char volatile *p, char v) { __iso_volatile_store8(p, v); }
Index: clang/test/CodeGen/ms-x86-intrinsics.c
===================================================================
--- clang/test/CodeGen/ms-x86-intrinsics.c
+++ clang/test/CodeGen/ms-x86-intrinsics.c
@@ -120,7 +120,7 @@
                     __int64 *HighProduct) {
   return _mul128(Multiplier, Multiplicand, HighProduct);
 }
-// CHECK-X64-LABEL: define dso_local i64 @test_mul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
+// CHECK-X64-LABEL: define dso_local i64 @test_mul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z0-9()_ ]*}}%HighProduct)
 // CHECK-X64: = sext i64 %Multiplier to i128
 // CHECK-X64: = sext i64 %Multiplicand to i128
 // CHECK-X64: = mul nsw i128 %
@@ -132,7 +132,7 @@
                              unsigned __int64 *HighProduct) {
   return _umul128(Multiplier, Multiplicand, HighProduct);
 }
-// CHECK-X64-LABEL: define dso_local i64 @test_umul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z_ ]*}}%HighProduct)
+// CHECK-X64-LABEL: define dso_local i64 @test_umul128(i64 %Multiplier, i64 %Multiplicand, i64*{{[a-z0-9()_ ]*}}%HighProduct)
 // CHECK-X64: = zext i64 %Multiplier to i128
 // CHECK-X64: = zext i64 %Multiplicand to i128
 // CHECK-X64: = mul nuw i128 %
Index: clang/test/CodeGen/systemz-inline-asm.c
===================================================================
--- clang/test/CodeGen/systemz-inline-asm.c
+++ clang/test/CodeGen/systemz-inline-asm.c
@@ -123,7 +123,7 @@
 long double test_f128(long double f, long double g) {
   asm("axbr %0, %2" : "=f" (f) : "0" (f), "f" (g));
   return f;
-// CHECK: define void @test_f128(fp128* noalias nocapture sret [[DEST:%.*]], fp128* nocapture readonly, fp128* nocapture readonly)
+// CHECK: define void @test_f128(fp128* noalias nocapture sret dereferenceable(16) [[DEST:%.*]], fp128* nocapture readonly dereferenceable(16), fp128* nocapture readonly dereferenceable(16))
 // CHECK: %f = load fp128, fp128* %0
 // CHECK: %g = load fp128, fp128* %1
 // CHECK: [[RESULT:%.*]] = tail call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g)
Index: clang/test/CodeGenOpenCL/kernels-have-spir-cc-by-default.cl
===================================================================
--- clang/test/CodeGenOpenCL/kernels-have-spir-cc-by-default.cl
+++ clang/test/CodeGenOpenCL/kernels-have-spir-cc-by-default.cl
@@ -28,7 +28,7 @@
 // CHECK: spir_kernel
 // AMDGCN: define amdgpu_kernel void @test_single
 // CHECK: struct.int_single* nocapture {{.*}} byval(%struct.int_single)
-// CHECK: i32* nocapture %output
+// CHECK: i32* nocapture dereferenceable(4) %output
   output[0] = input.a;
 }
 
@@ -36,7 +36,7 @@
 // CHECK: spir_kernel
 // AMDGCN: define amdgpu_kernel void @test_pair
 // CHECK: struct.int_pair* nocapture {{.*}} byval(%struct.int_pair)
-// CHECK: i32* nocapture %output
+// CHECK: i32* nocapture dereferenceable(8) %output
   output[0] = (int)input.a;
   output[1] = (int)input.b;
 }
@@ -45,7 +45,7 @@
 // CHECK: spir_kernel
 // AMDGCN: define amdgpu_kernel void @test_kernel
 // CHECK: struct.test_struct* nocapture {{.*}} byval(%struct.test_struct)
-// CHECK: i32* nocapture %output
+// CHECK: i32* nocapture dereferenceable(32) %output
   output[0] = input.elementA;
   output[1] = input.elementB;
   output[2] = (int)input.elementC;
@@ -59,7 +59,7 @@
 void test_function(int_pair input, global int* output) {
 // CHECK-NOT: spir_kernel
 // AMDGCN-NOT: define amdgpu_kernel void @test_function
-// CHECK: i64 %input.coerce0, i64 %input.coerce1, i32* nocapture %output
+// CHECK: i64 %input.coerce0, i64 %input.coerce1, i32* nocapture dereferenceable(8) %output
   output[0] = (int)input.a;
   output[1] = (int)input.b;
 }
Index: llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
===================================================================
--- llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -8,6 +8,7 @@
 
 #include "llvm/Transforms/IPO/InferFunctionAttrs.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -16,32 +17,159 @@
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 using namespace llvm;
 
+// TODO: Could use an LLVM set container, but we need sorted iteration here.
+using SetOfOffsets = std::set<int64_t>;
+using ArgToOffsetsMap = SmallDenseMap<Argument *, SetOfOffsets>;
+
 #define DEBUG_TYPE "inferattrs"
 
-static bool inferAllPrototypeAttributes(Module &M,
-                                        const TargetLibraryInfo &TLI) {
+// FIXME: This entire pass should be deprecated by making the "Attributor" pass
+// handle these kinds of inferences.
+
+static void getArgToOffsetsMap(Function &F, ArgToOffsetsMap &ArgOffsetMap) {
+  // To apply a dereferenceable attribute to an argument based on a memory
+  // access in the function, the access must be guaranteed to execute every
+  // time the function is called.
+  // Conservatively, only check for memory ops in the entry block that are
+  // guaranteed to execute.
+  // TODO: This could be enhanced by testing if a memory access post-dominates
+  // the entry block (walking to/from the load). We can also check if a
+  // block is guaranteed to transfer execution to another block.
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  BasicBlock &Entry = F.getEntryBlock();
+  for (Instruction &I : Entry) {
+    // Analyze pointer operands of any load/store instruction.
+    // TODO: Allow cmpxchg and atomicrmw opcodes.
+    // TODO: "isSimple()" excludes atomic ops, but some subset of those should
+    // be allowed.
+    Value *PtrOp = nullptr;
+    switch (I.getOpcode()) {
+    case Instruction::Load: {
+      auto *Load = cast<LoadInst>(&I);
+      if (Load->isSimple())
+        PtrOp = Load->getPointerOperand();
+      break;
+    }
+    case Instruction::Store: {
+      auto *Store = cast<StoreInst>(&I);
+      if (Store->isSimple())
+        PtrOp = Store->getPointerOperand();
+      break;
+    }
+    default:
+      break;
+    }
+    if (!PtrOp) {
+      if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+        return;
+      continue;
+    }
+    assert(isGuaranteedToTransferExecutionToSuccessor(&I) &&
+           "Expected simple memory access to transfer execution");
+
+    // Decompose the pointer into base (which must be a function argument) and
+    // offset. Ignore negative offsets because the dereferenceable range must
+    // begin at the argument.
+    int64_t ByteOffset;
+    Value *Base = GetPointerBaseWithConstantOffset(PtrOp, ByteOffset, DL);
+    auto *Arg = dyn_cast<Argument>(Base);
+    if (!Arg || ByteOffset < 0)
+      continue;
+
+    // Make sure we have a pointer to a type that is a multiple of 8-bit bytes
+    // because the 'dereferenceable' attribute range is specified using bytes.
+    // TODO: We can handle weird bitwidths by rounding down.
+    assert(Arg->getType()->isPointerTy() && "Unexpected non-pointer type");
+    Type *ArgEltType = cast<PointerType>(Arg->getType())->getElementType();
+    unsigned ArgSizeInBits = ArgEltType->getPrimitiveSizeInBits();
+    if (!ArgSizeInBits || ArgSizeInBits % 8 != 0)
+      continue;
+
+    // TODO: This restriction can be removed, but that will make the range
+    // calculation more complicated. Instead of only tracking whole number
+    // offsets from the base, we would have to track individual offsets and
+    // ranges (fractional and multiple offsets are possible via casts).
+    assert(isa<PointerType>(PtrOp->getType()) && "Expected pointer type");
+    Type *AccessType = cast<PointerType>(PtrOp->getType())->getElementType();
+    unsigned AccessSizeInBits = AccessType->getPrimitiveSizeInBits();
+    if (AccessSizeInBits != ArgSizeInBits)
+      continue;
+
+    assert((ByteOffset % (AccessSizeInBits / 8)) == 0 &&
+           "Unexpected address offset calculation");
+    SetOfOffsets &OffsetsForArg = ArgOffsetMap[Arg];
+    OffsetsForArg.insert(ByteOffset / (AccessSizeInBits / 8));
+  }
+}
+
+static bool inferDereferenceableFromMemoryAccesses(Function &F) {
+  ArgToOffsetsMap ArgOffsetMap;
+  getArgToOffsetsMap(F, ArgOffsetMap);
   bool Changed = false;
-  for (Function &F : M.functions())
-    // We only infer things using the prototype and the name; we don't need
-    // definitions.
-    if (F.isDeclaration() && !F.hasOptNone())
-      Changed |= inferLibFuncAttributes(F, TLI);
 
+  // For any pointer argument that we matched with memory accesses...
+  for (auto &ArgAndOffsetPair : ArgOffsetMap) {
+    Argument *Arg = ArgAndOffsetPair.getFirst();
+    SetOfOffsets &Offsets = ArgAndOffsetPair.getSecond();
+
+    // Determine how many consecutive memory accesses we found. The set is
+    // sorted, so as soon as we miss an offset from the pointer, we are done.
+    // We do not know if a chunk of memory is dereferenceable without an
+    // access.
+    // TODO: See the size limitation in getArgToOffsetsMap(). If we allow
+    // varying sizes of accesses from an argument, this will not be valid.
+    int64_t MaxOffset = 0;
+    for (int64_t Offset : Offsets) {
+      if (Offset != MaxOffset)
+        break;
+      ++MaxOffset;
+    }
+    // If there was no access directly from this pointer argument, give up.
+    // TODO: We could extend an existing known dereferenceable argument with
+    // extra bytes even if there are missing leading chunks.
+    if (!MaxOffset)
+      continue;
+
+    auto *PtrTy = cast<PointerType>(Arg->getType());
+    unsigned EltSize = PtrTy->getElementType()->getPrimitiveSizeInBits();
+    uint64_t DerefBytes = MaxOffset * (EltSize / 8);
+
+    // Replace existing dereferenceable attributes if we determined that more
+    // bytes are always accessed.
+    unsigned ArgNumber = Arg->getArgNo();
+    if (F.getParamDereferenceableBytes(ArgNumber) < DerefBytes) {
+      F.removeParamAttr(ArgNumber, Attribute::Dereferenceable);
+      F.removeParamAttr(ArgNumber, Attribute::DereferenceableOrNull);
+      F.addDereferenceableParamAttr(ArgNumber, DerefBytes);
+      Changed = true;
+    }
+  }
   return Changed;
 }
 
+static bool inferAttributes(Module &M, const TargetLibraryInfo &TLI) {
+  bool Changed = false;
+
+  for (Function &F : M.functions()) {
+    if (F.hasOptNone())
+      continue;
+    // For libfunc attributes, we infer things using the prototype and the
+    // name. For other attributes, we need to look at the function definition.
+    if (F.isDeclaration())
+      Changed |= inferLibFuncAttributes(F, TLI);
+    else
+      Changed |= inferDereferenceableFromMemoryAccesses(F);
+  }
+  return Changed;
+}
+
 PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
                                               ModuleAnalysisManager &AM) {
+  // If we may have changed fundamental function attributes, clear analyses.
+  // If we didn't infer anything, preserve all analyses.
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
-
-  if (!inferAllPrototypeAttributes(M, TLI))
-    // If we didn't infer anything, preserve all analyses.
-    return PreservedAnalyses::all();
-
-  // Otherwise, we may have changed fundamental function attributes, so clear
-  // out all the passes.
-  return PreservedAnalyses::none();
+  return inferAttributes(M, TLI) ? PreservedAnalyses::none()
+                                 : PreservedAnalyses::all();
 }
 
 namespace {
@@ -61,7 +189,7 @@
       return false;
 
     auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-    return inferAllPrototypeAttributes(M, TLI);
+    return inferAttributes(M, TLI);
   }
 };
 }
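To illustrate the inference added above (a sketch only: the function name and values below are hypothetical, and the "after" signature is hand-written to match the expected behavior exercised by the dereferenceable.ll tests further down, not captured opt output): consecutive, guaranteed-to-execute entry-block accesses at offsets 0..N from a pointer argument let the pass mark that argument dereferenceable for the bytes covered.

; Three i8 loads cover bytes [0,3) of %p1; two i32 loads cover bytes [0,8) of %p2.
define void @deref_sketch(i8* %p1, i32* %p2) {
  %p1.1 = getelementptr i8, i8* %p1, i64 1
  %p1.2 = getelementptr i8, i8* %p1, i64 2
  %p2.1 = getelementptr i32, i32* %p2, i64 1
  %t0 = load i8, i8* %p1
  %t1 = load i8, i8* %p1.1
  %t2 = load i8, i8* %p1.2
  %t3 = load i32, i32* %p2
  %t4 = load i32, i32* %p2.1
  ret void
}
; Expected after -inferattrs (mirrors the @ordering test below):
; define void @deref_sketch(i8* dereferenceable(3) %p1, i32* dereferenceable(8) %p2)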
Index: llvm/test/CodeGen/AMDGPU/inline-attr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/inline-attr.ll
+++ llvm/test/CodeGen/AMDGPU/inline-attr.ll
@@ -3,7 +3,7 @@
 ; RUN: opt -mtriple=amdgcn--amdhsa -S -O3 -enable-no-infs-fp-math %s | FileCheck -check-prefix=GCN -check-prefix=NOINFS %s
 
 ; GCN: define float @foo(float %x) local_unnamed_addr #0 {
-; GCN: define amdgpu_kernel void @caller(float addrspace(1)* nocapture %p) local_unnamed_addr #1 {
+; GCN: define amdgpu_kernel void @caller(float addrspace(1)* nocapture dereferenceable(4) %p) local_unnamed_addr #1 {
 ; GCN: %mul.i = fmul float %load, 1.500000e+01
 
 ; UNSAFE: attributes #0 = { norecurse nounwind readnone "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
Index: llvm/test/Transforms/InferFunctionAttrs/dereferenceable.ll
===================================================================
--- llvm/test/Transforms/InferFunctionAttrs/dereferenceable.ll
+++ llvm/test/Transforms/InferFunctionAttrs/dereferenceable.ll
@@ -1,10 +1,11 @@
 ; RUN: opt < %s -inferattrs -S | FileCheck %s
+; RUN: opt < %s -passes=inferattrs -S | FileCheck %s
 
 ; Determine dereference-ability before unused loads get deleted:
 ; https://bugs.llvm.org/show_bug.cgi?id=21780
 
 define <4 x double> @PR21780(double* %ptr) {
-; CHECK-LABEL: @PR21780(double* %ptr)
+; CHECK-LABEL: @PR21780(double* dereferenceable(32) %ptr)
 ; GEP of index 0 is simplified away.
   %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1
   %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
@@ -23,10 +24,10 @@
   ret <4 x double> %shuffle
 }
 
-; Unsimplified, but still valid. Also, throw in some bogus arguments.
+; Unsimplified, but still valid. Also, throw in a bogus argument and a store argument.
 
 define void @gep0(i8* %unused, i8* %other, i8* %ptr) {
-; CHECK-LABEL: @gep0(i8* %unused, i8* %other, i8* %ptr)
+; CHECK-LABEL: @gep0(i8* %unused, i8* dereferenceable(1) %other, i8* dereferenceable(3) %ptr)
   %arrayidx0 = getelementptr i8, i8* %ptr, i64 0
   %arrayidx1 = getelementptr i8, i8* %ptr, i64 1
   %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
@@ -41,7 +42,7 @@
 ; Multiple arguments may be dereferenceable.
 
 define void @ordering(i8* %ptr1, i32* %ptr2) {
-; CHECK-LABEL: @ordering(i8* %ptr1, i32* %ptr2)
+; CHECK-LABEL: @ordering(i8* dereferenceable(3) %ptr1, i32* dereferenceable(8) %ptr2)
   %a20 = getelementptr i32, i32* %ptr2, i64 0
   %a12 = getelementptr i8, i8* %ptr1, i64 2
   %t12 = load i8, i8* %a12
@@ -71,7 +72,7 @@
   ret void
 }
 
-; Not in entry block and not guaranteed to execute.
+; Negative test - not in entry block and not guaranteed to execute.
 
 define void @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond) {
 ; CHECK-LABEL: @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond)
@@ -92,7 +93,7 @@
 ; The last load may not execute, so the dereferenceable bytes only cover the first two loads.
 
 define void @partial_in_entry(i16* %ptr, i1 %cond) {
-; CHECK-LABEL: @partial_in_entry(i16* %ptr, i1 %cond)
+; CHECK-LABEL: @partial_in_entry(i16* dereferenceable(4) %ptr, i1 %cond)
 entry:
   %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
   %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
@@ -121,10 +122,23 @@
   ret void
 }
 
+; TODO: We should allow inference for atomic (but not volatile) ops.
+
+define void @atomic_is_alright(i16* %ptr) {
+; CHECK-LABEL: @atomic_is_alright(i16* %ptr)
+  %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
+  %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
+  %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
+  %t0 = load atomic i16, i16* %arrayidx0 unordered, align 2
+  %t1 = load i16, i16* %arrayidx1
+  %t2 = load i16, i16* %arrayidx2
+  ret void
+}
+
 declare void @may_not_return()
 
 define void @not_guaranteed_to_transfer_execution(i16* %ptr) {
-; CHECK-LABEL: @not_guaranteed_to_transfer_execution(i16* %ptr)
+; CHECK-LABEL: @not_guaranteed_to_transfer_execution(i16* dereferenceable(2) %ptr)
   %arrayidx0 = getelementptr i16, i16* %ptr, i64 0
   %arrayidx1 = getelementptr i16, i16* %ptr, i64 1
   %arrayidx2 = getelementptr i16, i16* %ptr, i64 2
@@ -138,7 +152,7 @@
 ; We must have consecutive accesses.
 
 define void @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index) {
-; CHECK-LABEL: @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index)
+; CHECK-LABEL: @variable_gep_index(i8* %unused, i8* dereferenceable(1) %ptr, i64 %variable_index)
   %arrayidx1 = getelementptr i8, i8* %ptr, i64 %variable_index
   %arrayidx2 = getelementptr i8, i8* %ptr, i64 2
   %t0 = load i8, i8* %ptr
   %t1 = load i8, i8* %arrayidx1
   %t2 = load i8, i8* %arrayidx2
   ret void
 }
 
-; Deal with >1 GEP index.
+; TODO: Deal with >1 GEP index.
 
 define void @multi_index_gep(<4 x i8>* %ptr) {
 ; CHECK-LABEL: @multi_index_gep(<4 x i8>* %ptr)
@@ -156,7 +170,7 @@
   ret void
 }
 
-; Could round weird bitwidths down?
+; TODO: Could round weird bitwidths down?
 
 define void @not_byte_multiple(i9* %ptr) {
 ; CHECK-LABEL: @not_byte_multiple(i9* %ptr)
@@ -165,7 +179,7 @@
   ret void
 }
 
-; Missing direct access from the pointer.
+; Negative test - missing direct access from the pointer.
 
 define void @no_pointer_deref(i16* %ptr) {
 ; CHECK-LABEL: @no_pointer_deref(i16* %ptr)
@@ -179,7 +193,7 @@
 ; Out-of-order accesses are ok, but a missing access ends the dereferenceable range.
 
 define void @non_consecutive(i32* %ptr) {
-; CHECK-LABEL: @non_consecutive(i32* %ptr)
+; CHECK-LABEL: @non_consecutive(i32* dereferenceable(8) %ptr)
   %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
   %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
   %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
@@ -192,7 +206,22 @@
 ; Improve on existing dereferenceable attribute.
 
 define void @more_bytes(i32* dereferenceable(8) %ptr) {
-; CHECK-LABEL: @more_bytes(i32* dereferenceable(8) %ptr)
+; CHECK-LABEL: @more_bytes(i32* dereferenceable(16) %ptr)
+  %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
+  %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
+  %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
+  %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
+  %t3 = load i32, i32* %arrayidx3
+  %t1 = load i32, i32* %arrayidx1
+  %t2 = load i32, i32* %arrayidx2
+  %t0 = load i32, i32* %arrayidx0
+  ret void
+}
+
+; Improve on existing dereferenceable_or_null attribute.
+
+define void @more_bytes_and_not_null(i32* dereferenceable_or_null(8) %ptr) {
+; CHECK-LABEL: @more_bytes_and_not_null(i32* dereferenceable(16) %ptr)
   %arrayidx3 = getelementptr i32, i32* %ptr, i64 3
   %arrayidx1 = getelementptr i32, i32* %ptr, i64 1
   %arrayidx0 = getelementptr i32, i32* %ptr, i64 0
   %arrayidx2 = getelementptr i32, i32* %ptr, i64 2
   %t3 = load i32, i32* %arrayidx3
   %t1 = load i32, i32* %arrayidx1
   %t2 = load i32, i32* %arrayidx2
   %t0 = load i32, i32* %arrayidx0
   ret void
 }
 
-; But don't pessimize existing dereferenceable attribute.
+; Negative test - don't pessimize an existing dereferenceable attribute.
 
 define void @better_bytes(i32* dereferenceable(100) %ptr) {
 ; CHECK-LABEL: @better_bytes(i32* dereferenceable(100) %ptr)
@@ -219,8 +248,10 @@
   ret void
 }
 
+; Peeking through a same-size-element bitcast is supported.
+
 define void @bitcast(i32* %arg) {
-; CHECK-LABEL: @bitcast(i32* %arg)
+; CHECK-LABEL: @bitcast(i32* dereferenceable(8) %arg)
   %ptr = bitcast i32* %arg to float*
   %arrayidx0 = getelementptr float, float* %ptr, i64 0
   %arrayidx1 = getelementptr float, float* %ptr, i64 1
@@ -229,6 +260,8 @@
   ret void
 }
 
+; TODO: Enhance to allow arbitrary sub-ranges.
+
 define void @bitcast_different_sizes(double* %arg1, i8* %arg2) {
 ; CHECK-LABEL: @bitcast_different_sizes(double* %arg1, i8* %arg2)
   %ptr1 = bitcast double* %arg1 to float*
@@ -247,8 +280,10 @@
   ret void
 }
 
+; The attribute has a length, not a range, so we can't represent this any better.
+
 define void @negative_offset(i32* %arg) {
-; CHECK-LABEL: @negative_offset(i32* %arg)
+; CHECK-LABEL: @negative_offset(i32* dereferenceable(4) %arg)
   %ptr = bitcast i32* %arg to float*
   %arrayidx0 = getelementptr float, float* %ptr, i64 0
   %arrayidx1 = getelementptr float, float* %ptr, i64 -1
@@ -257,8 +292,10 @@
   ret void
 }
 
+; Simple store accesses allow inference too.
+
 define void @stores(i32* %arg) {
-; CHECK-LABEL: @stores(i32* %arg)
+; CHECK-LABEL: @stores(i32* dereferenceable(8) %arg)
   %ptr = bitcast i32* %arg to float*
   %arrayidx0 = getelementptr float, float* %ptr, i64 0
   %arrayidx1 = getelementptr float, float* %ptr, i64 1
@@ -267,8 +304,10 @@
   ret void
 }
 
+; Loads and stores can be combined.
+
 define void @load_store(i32* %arg) {
-; CHECK-LABEL: @load_store(i32* %arg)
+; CHECK-LABEL: @load_store(i32* dereferenceable(8) %arg)
   %ptr = bitcast i32* %arg to float*
   %arrayidx0 = getelementptr float, float* %ptr, i64 0
   %arrayidx1 = getelementptr float, float* %ptr, i64 1