Index: clang/include/clang/Basic/TargetInfo.h
===================================================================
--- clang/include/clang/Basic/TargetInfo.h
+++ clang/include/clang/Basic/TargetInfo.h
@@ -585,6 +585,12 @@
     return false;
   }
 
+  /// Determine whether an aggregate output operand of inline assembly is
+  /// compatible with register constraints on this target.
+  virtual bool canStoreAggregateOperandInRegister(unsigned size) const {
+    return false;
+  }
+
   /// Determine whether _Float16 is supported on this target.
   virtual bool hasLegalHalfType() const { return HasLegalHalfType; }
 
Index: clang/lib/Basic/Targets/AArch64.h
===================================================================
--- clang/lib/Basic/Targets/AArch64.h
+++ clang/lib/Basic/Targets/AArch64.h
@@ -141,6 +141,10 @@
   bool hasInt128Type() const override;
 
   bool hasExtIntType() const override { return true; }
+
+  bool canStoreAggregateOperandInRegister(unsigned size) const override {
+    return size == 512 && HasLS64;
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo {
Index: clang/lib/Basic/Targets/AArch64.cpp
===================================================================
--- clang/lib/Basic/Targets/AArch64.cpp
+++ clang/lib/Basic/Targets/AArch64.cpp
@@ -751,6 +751,9 @@
       if (Size == 64)
         return true;
 
+      if (Size == 512)
+        return HasLS64;
+
       SuggestedModifier = "w";
       return false;
     }
Index: clang/lib/CodeGen/CGStmt.cpp
===================================================================
--- clang/lib/CodeGen/CGStmt.cpp
+++ clang/lib/CodeGen/CGStmt.cpp
@@ -2287,15 +2287,25 @@
     // by-value. If this is a memory result, return the value by-reference.
     bool isScalarizableAggregate =
         hasAggregateEvaluationKind(OutExpr->getType());
-    if (!Info.allowsMemory() && (hasScalarEvaluationKind(OutExpr->getType()) ||
-                                 isScalarizableAggregate)) {
+
+    unsigned Size = getContext().getTypeSize(OutExpr->getType());
+
+    // If this is a register output but the asm operand is of aggregate
+    // type, then make the inline asm return it by-reference and let
+    // the target deal with it when possible.
+    bool byRef = Info.allowsRegister() && isScalarizableAggregate &&
+                 getTarget().canStoreAggregateOperandInRegister(Size);
+
+    bool byVal = !Info.allowsMemory() &&
+                 (hasScalarEvaluationKind(OutExpr->getType()) || isScalarizableAggregate);
+
+    if (byVal && !byRef) {
       Constraints += "=" + OutputConstraint;
       ResultRegQualTys.push_back(OutExpr->getType());
       ResultRegDests.push_back(Dest);
       ResultTruncRegTypes.push_back(ConvertTypeForMem(OutExpr->getType()));
       if (Info.allowsRegister() && isScalarizableAggregate) {
         ResultTypeRequiresCast.push_back(true);
-        unsigned Size = getContext().getTypeSize(OutExpr->getType());
         llvm::Type *ConvTy = llvm::IntegerType::get(getLLVMContext(), Size);
         ResultRegTypes.push_back(ConvTy);
       } else {
Index: clang/test/CodeGen/aarch64-ls64-inline-asm.c
===================================================================
--- clang/test/CodeGen/aarch64-ls64-inline-asm.c
+++ clang/test/CodeGen/aarch64-ls64-inline-asm.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
+
+struct foo { unsigned long long x[8]; };
+
+// CHECK-LABEL: @load(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUTPUT_ADDR:%.*]] = alloca %struct.foo*, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store %struct.foo* [[OUTPUT:%.*]], %struct.foo** [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    store i8* [[ADDR:%.*]], i8** [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.foo*, %struct.foo** [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    call void asm sideeffect "ld64b $0,[$1]", "=*r,r,~{memory}"(%struct.foo* [[TMP0]], i8* [[TMP1]]) #[[ATTR1:[0-9]+]], !srcloc !6
+// CHECK-NEXT:    ret void
+//
+void load(struct foo *output, void *addr)
+{
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+
+// CHECK-LABEL: @store(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INPUT_ADDR:%.*]] = alloca %struct.foo*, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store %struct.foo* [[INPUT:%.*]], %struct.foo** [[INPUT_ADDR]], align 8
+// CHECK-NEXT:    store i8* [[ADDR:%.*]], i8** [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load %struct.foo*, %struct.foo** [[INPUT_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    call void asm sideeffect "st64b $0,[$1]", "*r,r,~{memory}"(%struct.foo* [[TMP0]], i8* [[TMP1]]) #[[ATTR1]], !srcloc !7
+// CHECK-NEXT:    ret void
+//
+void store(const struct foo *input, void *addr)
+{
+    __asm__ volatile ("st64b %0,[%1]" : : "r" (*input), "r" (addr) : "memory" );
+}
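
Usage sketch (reviewer illustration only, not part of the patch): the prototypes below are the load()/store() wrappers defined in the test above; copy_block() and the 0x90000000 device address are hypothetical names invented for this example. It assumes a build with the ls64 target feature enabled, as in the RUN lines.

#include <stdint.h>

struct foo { unsigned long long x[8]; };

/* Wrappers defined in the test file above. */
void load(struct foo *output, void *addr);
void store(const struct foo *input, void *addr);

/* Hypothetical helper: write one 64-byte block to a device and read it
 * back. The address is a placeholder; real code would use a mapping that
 * is valid for LD64B/ST64B accesses. */
void copy_block(struct foo *buf)
{
    void *device = (void *)(uintptr_t)0x90000000u; /* placeholder address */
    store(buf, device); /* runs the st64b wrapper: one 64-byte store */
    load(buf, device);  /* runs the ld64b wrapper: one 64-byte load */
}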