diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5010,6 +5010,14 @@
     Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call);
     break;
   }
+  case Intrinsic::invariant_start: {
+    ConstantInt *InvariantSize = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+    Assert(InvariantSize &&
+               (!InvariantSize->isNegative() || InvariantSize->isMinusOne()),
+           "invariant_start parameter must be -1, 0 or a positive number",
+           &Call);
+    break;
+  }
   case Intrinsic::matrix_multiply:
   case Intrinsic::matrix_transpose:
   case Intrinsic::matrix_column_major_load:
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -940,7 +940,19 @@
                                   Loop *CurLoop) {
   Value *Addr = LI->getOperand(0);
   const DataLayout &DL = LI->getModule()->getDataLayout();
-  const uint32_t LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+  const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
+
+  // It is not currently possible for clang to generate an invariant.start
+  // intrinsic with scalable vector types because we don't support thread local
+  // sizeless types and we don't permit sizeless types in structs or classes.
+  // Furthermore, even if support is added for this in future the intrinsic
+  // itself is defined to have a size of -1 for variable sized objects. This
+  // makes it impossible to verify if the intrinsic envelops our region of
+  // interest. For example, both <vscale x 32 x i8> and <vscale x 16 x i8>
+  // types would have a -1 parameter, but the former is clearly double the size
+  // of the latter.
+  if (LocSizeInBits.isScalable())
+    return false;
 
   // if the type is i8 addrspace(x)*, we know this is the type of
   // llvm.invariant.start operand
@@ -970,13 +982,17 @@
     if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
         !II->use_empty())
       continue;
-    unsigned InvariantSizeInBits =
-        cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+    ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
+    // The intrinsic supports having a -1 argument for variable sized objects
+    // so we should check for that here.
+    if (InvariantSize->isNegative())
+      continue;
+    uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
     // Confirm the invariant.start location size contains the load operand size
     // in bits. Also, the invariant.start should dominate the load, and we
     // should not hoist the load out of a loop that contains this dominating
     // invariant.start.
-    if (LocSizeInBits <= InvariantSizeInBits &&
+    if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
         DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
       return true;
   }
diff --git a/llvm/test/Transforms/LICM/AArch64/lit.local.cfg b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/sve-load-hoist.ll
@@ -0,0 +1,30 @@
+; RUN: opt -licm -mtriple aarch64-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+
+define void @no_hoist_load1_nxv2i64(<vscale x 2 x i64>* %out, i8* %in8, i32 %n) {
+; CHECK-LABEL: @no_hoist_load1_nxv2i64(
+; CHECK: entry:
+; CHECK-NOT: load
+; CHECK: for.body:
+; CHECK: load
+entry:
+  %cmp0 = icmp ugt i32 %n, 0
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 16, i8* %in8)
+  %in = bitcast i8* %in8 to <vscale x 2 x i64>*
+  br i1 %cmp0, label %for.body, label %for.end
+
+for.body:
+  %i = phi i32 [0, %entry], [%inc, %for.body]
+  %i2 = zext i32 %i to i64
+  %ptr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %out, i64 %i2
+  %val = load <vscale x 2 x i64>, <vscale x 2 x i64>* %in, align 16
+  store <vscale x 2 x i64> %val, <vscale x 2 x i64>* %ptr, align 16
+  %inc = add nuw nsw i32 %i, 1
+  %cmp = icmp ult i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+
diff --git a/llvm/test/Transforms/LICM/hoisting.ll b/llvm/test/Transforms/LICM/hoisting.ll
--- a/llvm/test/Transforms/LICM/hoisting.ll
+++ b/llvm/test/Transforms/LICM/hoisting.ll
@@ -360,3 +360,36 @@
 loopexit:
   ret i32 %sum
 }
+
+; We can't hoist the invariant load out of the loop because
+; the marker is given a variable size (-1).
+define i32 @test_fence5(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence5
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 -1, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}