diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9749,29 +9749,6 @@ return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID)); } - if (BuiltinID == clang::AArch64::BI__builtin_arm_prefetch) { - Value *Address = EmitScalarExpr(E->getArg(0)); - Value *RW = EmitScalarExpr(E->getArg(1)); - Value *CacheLevel = EmitScalarExpr(E->getArg(2)); - Value *RetentionPolicy = EmitScalarExpr(E->getArg(3)); - Value *IsData = EmitScalarExpr(E->getArg(4)); - - Value *Locality = nullptr; - if (cast<llvm::ConstantInt>(RetentionPolicy)->isZero()) { - // Temporal fetch, needs to convert cache level to locality. - Locality = llvm::ConstantInt::get(Int32Ty, - -cast<llvm::ConstantInt>(CacheLevel)->getValue() + 3); - } else { - // Streaming fetch. - Locality = llvm::ConstantInt::get(Int32Ty, 0); - } - - // FIXME: We need AArch64 specific LLVM intrinsic if we want to specify - // PLDL3STRM or PLDL2STRM. - Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType()); - return Builder.CreateCall(F, {Address, RW, Locality, IsData}); - } - if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) { assert((getContext().getTypeSize(E->getType()) == 32) && "rbit of unusual size!"); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3226,9 +3226,9 @@ if (BuiltinID == AArch64::BI__builtin_arm_prefetch) { return SemaBuiltinConstantArgRange(TheCall, 1, 0, 1) || - SemaBuiltinConstantArgRange(TheCall, 2, 0, 2) || - SemaBuiltinConstantArgRange(TheCall, 3, 0, 1) || - SemaBuiltinConstantArgRange(TheCall, 4, 0, 1); + SemaBuiltinConstantArgRange(TheCall, 2, 0, 3) || + SemaBuiltinConstantArgRange(TheCall, 3, 0, 1) || + SemaBuiltinConstantArgRange(TheCall, 4, 0, 1); } if (BuiltinID == AArch64::BI__builtin_arm_rsr64 || diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c --- 
a/clang/test/CodeGen/arm_acle.c +++ b/clang/test/CodeGen/arm_acle.c @@ -168,10 +168,15 @@ /* 8.6 Memory prefetch intrinsics */ /* 8.6.1 Data prefetch */ -// ARM-LABEL: @test_pld( -// ARM-NEXT: entry: -// ARM-NEXT: call void @llvm.prefetch.p0(ptr null, i32 0, i32 3, i32 1) -// ARM-NEXT: ret void +// AArch32-LABEL: @test_pld( +// AArch32-NEXT: entry: +// AArch32-NEXT: call void @llvm.prefetch.p0(ptr null, i32 0, i32 3, i32 1) +// AArch32-NEXT: ret void +// +// AArch64-LABEL: @test_pld( +// AArch64-NEXT: entry: +// AArch64-NEXT: call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 0, i32 1) +// AArch64-NEXT: ret void // void test_pld() { __pld(0); @@ -184,7 +189,7 @@ // // AArch64-LABEL: @test_pldx( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.prefetch.p0(ptr null, i32 1, i32 1, i32 1) +// AArch64-NEXT: call void @llvm.aarch64.prefetch(ptr null, i32 1, i32 2, i32 0, i32 1) // AArch64-NEXT: ret void // void test_pldx() { @@ -192,10 +197,15 @@ } /* 8.6.2 Instruction prefetch */ -// ARM-LABEL: @test_pli( -// ARM-NEXT: entry: -// ARM-NEXT: call void @llvm.prefetch.p0(ptr null, i32 0, i32 3, i32 0) -// ARM-NEXT: ret void +// AArch32-LABEL: @test_pli( +// AArch32-NEXT: entry: +// AArch32-NEXT: call void @llvm.prefetch.p0(ptr null, i32 0, i32 3, i32 0) +// AArch32-NEXT: ret void +// +// AArch64-LABEL: @test_pli( +// AArch64-NEXT: entry: +// AArch64-NEXT: call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 0, i32 0) +// AArch64-NEXT: ret void // void test_pli() { __pli(0); @@ -208,7 +218,7 @@ // // AArch64-LABEL: @test_plix( // AArch64-NEXT: entry: -// AArch64-NEXT: call void @llvm.prefetch.p0(ptr null, i32 0, i32 1, i32 0) +// AArch64-NEXT: call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 2, i32 0, i32 0) // AArch64-NEXT: ret void // void test_plix() { diff --git a/clang/test/CodeGen/builtins-arm64.c b/clang/test/CodeGen/builtins-arm64.c --- a/clang/test/CodeGen/builtins-arm64.c +++ b/clang/test/CodeGen/builtins-arm64.c @@ -47,16 +47,19 @@ 
void prefetch(void) { __builtin_arm_prefetch(0, 1, 2, 0, 1); // pstl3keep - // CHECK: call {{.*}} @llvm.prefetch.p0(ptr null, i32 1, i32 1, i32 1) + // CHECK: call {{.*}} @llvm.aarch64.prefetch(ptr null, i32 1, i32 2, i32 0, i32 1) __builtin_arm_prefetch(0, 0, 0, 1, 1); // pldl1keep - // CHECK: call {{.*}} @llvm.prefetch.p0(ptr null, i32 0, i32 0, i32 1) + // CHECK: call {{.*}} @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 1, i32 1) __builtin_arm_prefetch(0, 0, 0, 1, 1); // pldl1strm - // CHECK: call {{.*}} @llvm.prefetch.p0(ptr null, i32 0, i32 0, i32 1) + // CHECK: call {{.*}} @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 1, i32 1) __builtin_arm_prefetch(0, 0, 0, 0, 0); // plil1keep - // CHECK: call {{.*}} @llvm.prefetch.p0(ptr null, i32 0, i32 3, i32 0) + // CHECK: call {{.*}} @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 0, i32 0) + + __builtin_arm_prefetch(0, 0, 3, 0, 1); // pldslckeep + // CHECK: call {{.*}} @llvm.aarch64.prefetch(ptr null, i32 0, i32 3, i32 0, i32 1) } __attribute__((target("v8.5a"))) diff --git a/clang/test/Sema/builtins-arm64.c b/clang/test/Sema/builtins-arm64.c --- a/clang/test/Sema/builtins-arm64.c +++ b/clang/test/Sema/builtins-arm64.c @@ -25,7 +25,7 @@ void test_prefetch(void) { __builtin_arm_prefetch(0, 2, 0, 0, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}} - __builtin_arm_prefetch(0, 0, 3, 0, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}} + __builtin_arm_prefetch(0, 0, 4, 0, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}} __builtin_arm_prefetch(0, 0, 0, 2, 0); // expected-error-re {{argument value {{.*}} is outside the valid range}} __builtin_arm_prefetch(0, 0, 0, 0, 2); // expected-error-re {{argument value {{.*}} is outside the valid range}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -65,6 +65,14 @@ def int_aarch64_break : Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrNoReturn, IntrCold, ImmArg<ArgIndex<0>>]>; + +def int_aarch64_prefetch : Intrinsic<[], + [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, ReadOnly<ArgIndex<0>>, + ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>> + ]>, + ClangBuiltin<"__builtin_arm_prefetch">; + //===----------------------------------------------------------------------===// // Data Barrier Instructions diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5754,6 +5754,17 @@ &Call); break; } + case Intrinsic::aarch64_prefetch: { + Check(cast<ConstantInt>(Call.getArgOperand(1))->getZExtValue() < 2, + "write argument to llvm.aarch64.prefetch must be 0 or 1", Call); + Check(cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue() < 4, + "target argument to llvm.aarch64.prefetch must be 0-3", Call); + Check(cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue() < 2, + "stream argument to llvm.aarch64.prefetch must be 0 or 1", Call); + Check(cast<ConstantInt>(Call.getArgOperand(4))->getZExtValue() < 2, + "isdata argument to llvm.aarch64.prefetch must be 0 or 1", Call); + break; + } }; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -956,6 +956,7 @@ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization(const CallLoweringInfo &CLI) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1189,9 +1189,6 @@ } } - if (Subtarget->hasSME()) - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (Subtarget->hasSVE()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -1513,6 +1510,8 @@ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); } + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); IsStrictFPEnabled = true; @@ -4713,6 +4712,44 @@ return std::nullopt; } +SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + SDLoc DL(Op); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_prefetch: { + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(2); + + unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + unsigned Locality = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); + unsigned IsStream = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); + unsigned IsData = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned PrfOp = (IsWrite << 4) | // Load/Store bit + (!IsData << 3) | // IsDataCache bit + (Locality << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + + return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, + DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); + } + case Intrinsic::aarch64_sme_za_enable: + return DAG.getNode( + AArch64ISD::SMSTART, DL, MVT::Other, + Op->getOperand(0), // Chain + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sme_za_disable: + return DAG.getNode( + AArch64ISD::SMSTOP, DL, MVT::Other, + Op->getOperand(0), // Chain + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + 
DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + } +} + SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -4743,18 +4780,6 @@ // changed. return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL); } - case Intrinsic::aarch64_sme_za_enable: - return DAG.getNode( - AArch64ISD::SMSTART, DL, MVT::Other, - Op->getOperand(0), // Chain - DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); - case Intrinsic::aarch64_sme_za_disable: - return DAG.getNode( - AArch64ISD::SMSTOP, DL, MVT::Other, - Op->getOperand(0), // Chain - DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); } } @@ -5861,11 +5886,12 @@ return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED); case ISD::MULHU: return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED); - case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); case ISD::ATOMIC_STORE: if (cast(Op)->getMemoryVT() == MVT::i128) { assert(Subtarget->hasLSE2()); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1063,6 +1063,24 @@ MI.eraseFromParent(); return true; } + case Intrinsic::aarch64_prefetch: { + MachineIRBuilder MIB(MI); + auto &AddrVal = MI.getOperand(1); + + int64_t IsWrite = MI.getOperand(2).getImm(); + int64_t Target = MI.getOperand(3).getImm(); + int64_t IsStream = MI.getOperand(4).getImm(); + int64_t IsData = MI.getOperand(5).getImm(); + + unsigned PrfOp = (IsWrite << 
4) | // Load/Store bit + (!IsData << 3) | // IsDataCache bit + (Target << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + + MIB.buildInstr(AArch64::G_PREFETCH).addImm(PrfOp).add(AddrVal); + MI.eraseFromParent(); + return true; + } } return true; diff --git a/llvm/test/CodeGen/AArch64/arm64-prefetch-new.ll b/llvm/test/CodeGen/AArch64/arm64-prefetch-new.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-prefetch-new.ll @@ -0,0 +1,67 @@ +; RUN: llc -mtriple=aarch64 -mattr=+v8.9a --global-isel=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64 -mattr=+v8.9a --global-isel=1 --global-isel-abort=1 < %s | FileCheck %s + +@a = internal global ptr null, align 8 +@b = external global ptr, align 8 + +define void @test(ptr %i, i32 %j) nounwind ssp { +entry: + ; CHECK-LABEL: @test + %j.addr = alloca i32, align 4 + store i32 %j, ptr %j.addr, align 4, !tbaa !0 + %tmp = bitcast ptr %j.addr to ptr + + %i.next = getelementptr i8, ptr %i, i64 2 + + ; Verify prefetching works for all the different kinds of pointers we might + ; want to prefetch. + + ; CHECK: prfm pldl1keep, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 0, i32 1) + + ; CHECK: prfum pldl1keep, + call void @llvm.aarch64.prefetch(ptr %tmp, i32 0, i32 0, i32 0, i32 1) + + ; CHECK: prfm pldl1keep, + call void @llvm.aarch64.prefetch(ptr %i, i32 0, i32 0, i32 0, i32 1) + + ; CHECK: prfum pldl1keep, + call void @llvm.aarch64.prefetch(ptr %i.next, i32 0, i32 0, i32 0, i32 1) + + ; CHECK: prfm pldl1keep, + call void @llvm.aarch64.prefetch(ptr @a, i32 0, i32 0, i32 0, i32 1) + + ; CHECK: prfm pldl1keep, + call void @llvm.aarch64.prefetch(ptr @b, i32 0, i32 0, i32 0, i32 1) + + ; Verify that we can generate every single valid prefetch value. 
+ + ; CHECK: prfm pstl1keep, + call void @llvm.aarch64.prefetch(ptr null, i32 1, i32 0, i32 0, i32 1) + + ; CHECK: prfm pldl2keep, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 1, i32 0, i32 1) + + ; CHECK: prfm pldl3keep, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 2, i32 0, i32 1) + + ; CHECK: prfm pldslckeep, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 3, i32 0, i32 1) + + ; CHECK: prfm pldl1strm, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 1, i32 1) + + ; CHECK: prfm plil1keep, + call void @llvm.aarch64.prefetch(ptr null, i32 0, i32 0, i32 0, i32 0) + + ret void +} + +declare void @llvm.aarch64.prefetch(ptr readonly, i32 immarg, i32 immarg, i32 immarg, i32 immarg) #0 + +attributes #0 = { inaccessiblemem_or_argmemonly nounwind willreturn } + +!0 = !{!"int", !1} +!1 = !{!"omnipotent char", !2} +!2 = !{!"Simple C/C++ TBAA"} +!3 = !{!"any pointer", !1}