diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -431,7 +431,8 @@
            Feature == "sve2-aes" || Feature == "sve2-sha3" ||
            Feature == "sve2-sm4" || Feature == "f64mm" || Feature == "f32mm" ||
            Feature == "i8mm" || Feature == "bf16") &&
-          (FPU & SveMode));
+          (FPU & SveMode)) ||
+         (Feature == "ls64" && HasLS64);
 }
 
 bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
@@ -752,6 +753,9 @@
       if (Size == 64)
         return true;
 
+      if (Size == 512)
+        return HasLS64;
+
       SuggestedModifier = "w";
       return false;
     }
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -2097,7 +2097,8 @@
     } else {
       llvm::Type *Ty = ConvertType(InputType);
       uint64_t Size = CGM.getDataLayout().getTypeSizeInBits(Ty);
-      if (Size <= 64 && llvm::isPowerOf2_64(Size)) {
+      if ((Size <= 64 && llvm::isPowerOf2_64(Size)) ||
+          getTargetHooks().isScalarizableAsmOperand(*this, Ty)) {
         Ty = llvm::IntegerType::get(getLLVMContext(), Size);
         Ty = llvm::PointerType::getUnqual(Ty);
 
@@ -2320,23 +2321,28 @@
     // If this is a register output, then make the inline asm return it
     // by-value.  If this is a memory result, return the value by-reference.
-    bool isScalarizableAggregate =
-        hasAggregateEvaluationKind(OutExpr->getType());
-    if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
-                                 isScalarizableAggregate)) {
+    QualType QTy = OutExpr->getType();
+    const bool IsScalarOrAggregate = hasScalarEvaluationKind(QTy) ||
+                                     hasAggregateEvaluationKind(QTy);
+    if (!Info.allowsMemory() && IsScalarOrAggregate) {
+
       Constraints += "=" + OutputConstraint;
-      ResultRegQualTys.push_back(OutExpr->getType());
+      ResultRegQualTys.push_back(QTy);
       ResultRegDests.push_back(Dest);
-      ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
-      if (Info.allowsRegister() && isScalarizableAggregate) {
-        ResultTypeRequiresCast.push_back(true);
-        unsigned Size = getContext().getTypeSize(OutExpr->getType());
-        llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
-        ResultRegTypes.push_back(ConvTy);
-      } else {
-        ResultTypeRequiresCast.push_back(false);
-        ResultRegTypes.push_back(ResultTruncRegTypes.back());
+
+      llvm::Type *Ty = ConvertTypeForMem(QTy);
+      const bool RequiresCast = Info.allowsRegister() &&
+          (getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
+           Ty->isAggregateType());
+
+      ResultTruncRegTypes.push_back(Ty);
+      ResultTypeRequiresCast.push_back(RequiresCast);
+
+      if (RequiresCast) {
+        unsigned Size = getContext().getTypeSize(QTy);
+        Ty = llvm::IntegerType::get(getLLVMContext(), Size);
       }
+      ResultRegTypes.push_back(Ty);
 
       // If this output is tied to an input, and if the input is larger, then
       // we need to set the actual result type of the inline asm node to be the
       // same as the input type.
@@ -2638,11 +2644,11 @@
   assert(ResultTypeRequiresCast.size() <= ResultRegDests.size());
   for (unsigned i = 0, e = RegResults.size(); i != e; ++i) {
     llvm::Value *Tmp = RegResults[i];
+    llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
     // If the result type of the LLVM IR asm doesn't match the result type of
     // the expression, do the conversion.
     if (ResultRegTypes[i] != ResultTruncRegTypes[i]) {
-      llvm::Type *TruncTy = ResultTruncRegTypes[i];
 
       // Truncate the integer result to the right size, note that TruncTy can be
       // a pointer.
@@ -2672,6 +2678,11 @@
       unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Builder.CreateBitCast(Dest.getAddress(*this),
                                         ResultRegTypes[i]->getPointerTo());
+      if (getTargetHooks().isScalarizableAsmOperand(*this, TruncTy)) {
+        Builder.CreateStore(Tmp, A);
+        continue;
+      }
+
       QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false);
       if (Ty.isNull()) {
         const Expr *OutExpr = S.getOutputExpr(i);
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -148,6 +148,13 @@
     return Ty;
   }
 
+  /// Target hook to decide whether an inline asm operand can be passed
+  /// by value.
+  virtual bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                        llvm::Type *Ty) const {
+    return false;
+  }
+
   /// Adds constraints and types for result registers.
   virtual void addReturnRegisterOutputs(
       CodeGen::CodeGenFunction &CGF, CodeGen::LValue ReturnValue,
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5526,6 +5526,20 @@
     Fn->addFnAttr("branch-target-enforcement",
                   BPI.BranchTargetEnforcement ? "true" : "false");
   }
+
+  bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF,
+                                llvm::Type *Ty) const override {
+    if (CGF.getTarget().hasFeature("ls64")) {
+      auto *ST = dyn_cast<llvm::StructType>(Ty);
+      if (ST && ST->getNumElements() == 1) {
+        auto *AT = dyn_cast<llvm::ArrayType>(ST->getElementType(0));
+        if (AT && AT->getNumElements() == 8 &&
+            AT->getElementType()->isIntegerTy(64))
+          return true;
+      }
+    }
+    return TargetCodeGenInfo::isScalarizableAsmOperand(CGF, Ty);
+  }
 };
 
 class WindowsAArch64TargetCodeGenInfo : public AArch64TargetCodeGenInfo {
diff --git a/clang/test/CodeGen/aarch64-ls64-inline-asm.c b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-ls64-inline-asm.c
@@ -0,0 +1,84 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -O1 -S -emit-llvm -x c %s -o - | FileCheck %s
+
+struct foo { unsigned long long x[8]; };
+
+// CHECK-LABEL: @load(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(i8* [[ADDR:%.*]]) #[[ATTR1:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.foo* [[OUTPUT:%.*]] to i512*
+// CHECK-NEXT:    store i512 [[TMP0]], i512* [[TMP1]], align 8
+// CHECK-NEXT:    ret void
+//
+void load(struct foo *output, void *addr)
+{
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+
+// CHECK-LABEL: @store(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.foo* [[INPUT:%.*]] to i512*
+// CHECK-NEXT:    [[TMP1:%.*]] = load i512, i512* [[TMP0]], align 8
+// CHECK-NEXT:    call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !7
+// CHECK-NEXT:    ret void
+//
+void store(const struct foo *input, void *addr)
+{
+    __asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
+}
+
+// CHECK-LABEL: @store2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[IN:%.*]], align 4, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[TMP0]] to i64
+// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[TMP1]] to i64
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV5:%.*]] = sext i32 [[TMP2]] to i64
+// CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV8:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 25
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV11:%.*]] = sext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 36
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV14:%.*]] = sext i32 [[TMP5]] to i64
+// CHECK-NEXT:    [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 49
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV17:%.*]] = sext i32 [[TMP6]] to i64
+// CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX19]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CONV20:%.*]] = sext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512
+// CHECK-NEXT:    [[S_SROA_10_0_INSERT_SHIFT:%.*]] = shl nuw i512 [[S_SROA_10_0_INSERT_EXT]], 448
+// CHECK-NEXT:    [[S_SROA_9_0_INSERT_EXT:%.*]] = zext i64 [[CONV17]] to i512
+// CHECK-NEXT:    [[S_SROA_9_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_9_0_INSERT_EXT]], 384
+// CHECK-NEXT:    [[S_SROA_9_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_10_0_INSERT_SHIFT]], [[S_SROA_9_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_8_0_INSERT_EXT:%.*]] = zext i64 [[CONV14]] to i512
+// CHECK-NEXT:    [[S_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_8_0_INSERT_EXT]], 320
+// CHECK-NEXT:    [[S_SROA_8_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_9_0_INSERT_INSERT]], [[S_SROA_8_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_7_0_INSERT_EXT:%.*]] = zext i64 [[CONV11]] to i512
+// CHECK-NEXT:    [[S_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_7_0_INSERT_EXT]], 256
+// CHECK-NEXT:    [[S_SROA_7_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_8_0_INSERT_INSERT]], [[S_SROA_7_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_6_0_INSERT_EXT:%.*]] = zext i64 [[CONV8]] to i512
+// CHECK-NEXT:    [[S_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_6_0_INSERT_EXT]], 192
+// CHECK-NEXT:    [[S_SROA_6_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_7_0_INSERT_INSERT]], [[S_SROA_6_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_5_0_INSERT_EXT:%.*]] = zext i64 [[CONV5]] to i512
+// CHECK-NEXT:    [[S_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_5_0_INSERT_EXT]], 128
+// CHECK-NEXT:    [[S_SROA_4_0_INSERT_EXT:%.*]] = zext i64 [[CONV2]] to i512
+// CHECK-NEXT:    [[S_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i512 [[S_SROA_4_0_INSERT_EXT]], 64
+// CHECK-NEXT:    [[S_SROA_4_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_6_0_INSERT_INSERT]], [[S_SROA_5_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_0_0_INSERT_EXT:%.*]] = zext i64 [[CONV]] to i512
+// CHECK-NEXT:    [[S_SROA_0_0_INSERT_MASK:%.*]] = or i512 [[S_SROA_4_0_INSERT_MASK]], [[S_SROA_4_0_INSERT_SHIFT]]
+// CHECK-NEXT:    [[S_SROA_0_0_INSERT_INSERT:%.*]] = or i512 [[S_SROA_0_0_INSERT_MASK]], [[S_SROA_0_0_INSERT_EXT]]
+// CHECK-NEXT:    call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[S_SROA_0_0_INSERT_INSERT]], i8* [[ADDR:%.*]]) #[[ATTR1]], !srcloc !12
+// CHECK-NEXT:    ret void
+//
+void store2(int *in, void *addr)
+{
+    struct foo s = { in[0], in[1], in[4], in[16], in[25], in[36], in[49], in[64] };
+    __asm__ volatile ("st64b %0,[%1]" : : "r" (s), "r" (addr) : "memory" );
+}