This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen] Initialize large arrays by copying from a global
ClosedPublic

Authored by kosarev on Feb 12 2018, 5:21 AM.

Download Raw Diff

Details

Reviewers

Commits

rGe0ef348cb9c5: [CodeGen] Initialize large arrays by copying from a global
rC325478: [CodeGen] Initialize large arrays by copying from a global
rL325478: [CodeGen] Initialize large arrays by copying from a global

Summary

Currently, clang compiles explicit initializers for array elements into a series of store instructions. For large arrays of built-in types this results in bloated output code and a significant amount of time spent in the instruction selection phase. This patch fixes the issue by initializing such arrays from global constants that store the binary image of the initializer.

Diff Detail

Event Timeline

kosarev created this revision. · Feb 12 2018, 5:21 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript · Feb 12 2018, 5:21 AM

rjmccall added inline comments. · Feb 12 2018, 9:18 AM

lib/CodeGen/CGExprAgg.cpp
421	Is there a good reason to use an element-count heuristic instead of a total-size heuristic here? Why only builtin types? That seems to pointlessly rule out nested arrays, complex types, vectors, C structs, and so on. I think the predicate you probably want here is isTriviallyCopyableType.

kosarev added inline comments. · Feb 13 2018, 7:54 AM

lib/CodeGen/CGExprAgg.cpp
421	> Is there a good reason to use an element-count heuristic instead of a total-size heuristic here?

Yes, the code below generates per-element initialization only for explicitly specified initializers. The rest, if any, is initialized with a filler, so it doesn't affect the size of the resulting code much.

Improved as suggested to cover all trivially-copyable types.

rjmccall added inline comments. · Feb 13 2018, 10:48 AM

lib/CodeGen/CGExprAgg.cpp
421	That makes sense, but you could still base it on the total size being initialized with explicit initializers. Such initializers, even when constant, are likely to require code size basically proportionate to the number of bytes initialized — sizes of immediate operands and all that.

Updated to consider the total size of the explicit initializers instead of their number. The threshold value is adjusted accordingly.

LGTM, thanks!

This revision is now accepted and ready to land. · Feb 16 2018, 9:58 AM

Closed by commit rL325478: [CodeGen] Initialize large arrays by copying from a global (authored by kosarev). · Explain Why · Feb 19 2018, 1:52 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

CodeGen/

CGExprAgg.cpp

36 lines

test/

CodeGen/

init.c

23 lines

Diff 134590

lib/CodeGen/CGExprAgg.cpp

//===--- CGExprAgg.cpp - Emit LLVM Code from Aggregate Expressions --------===//		//===--- CGExprAgg.cpp - Emit LLVM Code from Aggregate Expressions --------===//
//		//
// The LLVM Compiler Infrastructure		// The LLVM Compiler Infrastructure
//		//
// This file is distributed under the University of Illinois Open Source		// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.		// License. See LICENSE.TXT for details.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This contains code to emit Aggregate Expr nodes as LLVM code.		// This contains code to emit Aggregate Expr nodes as LLVM code.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "CodeGenFunction.h"		#include "CodeGenFunction.h"
#include "CGObjCRuntime.h"		#include "CGObjCRuntime.h"
#include "CodeGenModule.h"		#include "CodeGenModule.h"
		#include "ConstantEmitter.h"
#include "clang/AST/ASTContext.h"		#include "clang/AST/ASTContext.h"
#include "clang/AST/DeclCXX.h"		#include "clang/AST/DeclCXX.h"
#include "clang/AST/DeclTemplate.h"		#include "clang/AST/DeclTemplate.h"
#include "clang/AST/StmtVisitor.h"		#include "clang/AST/StmtVisitor.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"		#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"		#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Intrinsics.h"		#include "llvm/IR/Intrinsics.h"
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	public:
void EmitFinalDestCopy(QualType type, const LValue &src);		void EmitFinalDestCopy(QualType type, const LValue &src);
void EmitFinalDestCopy(QualType type, RValue src);		void EmitFinalDestCopy(QualType type, RValue src);
void EmitCopy(QualType type, const AggValueSlot &dest,		void EmitCopy(QualType type, const AggValueSlot &dest,
const AggValueSlot &src);		const AggValueSlot &src);

void EmitMoveFromReturnSlot(const Expr *E, RValue Src);		void EmitMoveFromReturnSlot(const Expr *E, RValue Src);

void EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,		void EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
QualType elementType, InitListExpr *E);		QualType ArrayQTy, InitListExpr *E);

AggValueSlot::NeedsGCBarriers_t needsGC(QualType T) {		AggValueSlot::NeedsGCBarriers_t needsGC(QualType T) {
if (CGF.getLangOpts().getGC() && TypeRequiresGCollection(T))		if (CGF.getLangOpts().getGC() && TypeRequiresGCollection(T))
return AggValueSlot::NeedsGCBarriers;		return AggValueSlot::NeedsGCBarriers;
return AggValueSlot::DoesNotNeedGCBarriers;		return AggValueSlot::DoesNotNeedGCBarriers;
}		}

bool TypeRequiresGCollection(QualType T);		bool TypeRequiresGCollection(QualType T);
▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	return Cons->getConstructor()->isDefaultConstructor() &&
Cons->getConstructor()->isTrivial();		Cons->getConstructor()->isTrivial();

// FIXME: Are there other cases where we can avoid emitting an initializer?		// FIXME: Are there other cases where we can avoid emitting an initializer?
return false;		return false;
}		}

/// \brief Emit initialization of an array from an initializer list.		/// \brief Emit initialization of an array from an initializer list.
void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,		void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType,
QualType elementType, InitListExpr *E) {		QualType ArrayQTy, InitListExpr *E) {
uint64_t NumInitElements = E->getNumInits();		uint64_t NumInitElements = E->getNumInits();

uint64_t NumArrayElements = AType->getNumElements();		uint64_t NumArrayElements = AType->getNumElements();
assert(NumInitElements <= NumArrayElements);		assert(NumInitElements <= NumArrayElements);

		QualType elementType =
		CGF.getContext().getAsArrayType(ArrayQTy)->getElementType();

// DestPtr is an array. Construct an elementType by drilling		// DestPtr is an array. Construct an elementType by drilling
// down a level.		// down a level.
llvm::Value *zero = llvm::ConstantInt::get(CGF.SizeTy, 0);		llvm::Value *zero = llvm::ConstantInt::get(CGF.SizeTy, 0);
llvm::Value *indices[] = { zero, zero };		llvm::Value *indices[] = { zero, zero };
llvm::Value *begin =		llvm::Value *begin =
Builder.CreateInBoundsGEP(DestPtr.getPointer(), indices, "arrayinit.begin");		Builder.CreateInBoundsGEP(DestPtr.getPointer(), indices, "arrayinit.begin");

CharUnits elementSize = CGF.getContext().getTypeSizeInChars(elementType);		CharUnits elementSize = CGF.getContext().getTypeSizeInChars(elementType);
CharUnits elementAlign =		CharUnits elementAlign =
DestPtr.getAlignment().alignmentOfArrayElement(elementSize);		DestPtr.getAlignment().alignmentOfArrayElement(elementSize);

		// Consider initializing the array by copying from a global. For this to be
		// more efficient than per-element initialization, the number of the elements
		// with explicit initializers should be large enough.
		if (NumInitElements * elementSize.getQuantity() > 16 &&
		[Inline comment — rjmccall (Unsubmitted, Not Done)] Is there a good reason to use an element-count heuristic instead of a total-size heuristic here? Why only builtin types? That seems to pointlessly rule out nested arrays, complex types, vectors, C structs, and so on. I think the predicate you probably want here is isTriviallyCopyableType.
		[Inline comment — kosarev (Author; Unsubmitted, Not Done)] (Quoting rjmccall: "Is there a good reason to use an element-count heuristic instead of a total-size heuristic here?") Yes, the code below generates per-element initialization only for explicitly specified initializers. The rest, if any, is initialized with a filler, so it doesn't affect the size of the resulting code much.
		[Inline comment — rjmccall (Unsubmitted, Not Done)] That makes sense, but you could still base it on the total size being initialized with explicit initializers. Such initializers, even when constant, are likely to require code size basically proportionate to the number of bytes initialized — sizes of immediate operands and all that.
		elementType.isTriviallyCopyableType(CGF.getContext())) {
		CodeGen::CodeGenModule &CGM = CGF.CGM;
		ConstantEmitter Emitter(CGM);
		LangAS AS = ArrayQTy.getAddressSpace();
		if (llvm::Constant *C = Emitter.tryEmitForInitializer(E, AS, ArrayQTy)) {
		auto GV = new llvm::GlobalVariable(
		CGM.getModule(), C->getType(),
		CGM.isTypeConstant(ArrayQTy, /* ExcludeCtorDtor= */ true),
		llvm::GlobalValue::PrivateLinkage, C, "constinit",
		/* InsertBefore= */ nullptr, llvm::GlobalVariable::NotThreadLocal,
		CGM.getContext().getTargetAddressSpace(AS));
		Emitter.finalize(GV);
		CharUnits Align = CGM.getContext().getTypeAlignInChars(ArrayQTy);
		GV->setAlignment(Align.getQuantity());
		EmitFinalDestCopy(ArrayQTy, CGF.MakeAddrLValue(GV, ArrayQTy, Align));
		return;
		}
		}

// Exception safety requires us to destroy all the		// Exception safety requires us to destroy all the
// already-constructed members if an initializer throws.		// already-constructed members if an initializer throws.
// For that, we'll need an EH cleanup.		// For that, we'll need an EH cleanup.
QualType::DestructionKind dtorKind = elementType.isDestructedType();		QualType::DestructionKind dtorKind = elementType.isDestructedType();
Address endOfInit = Address::invalid();		Address endOfInit = Address::invalid();
EHScopeStack::stable_iterator cleanup;		EHScopeStack::stable_iterator cleanup;
llvm::Instruction *cleanupDominator = nullptr;		llvm::Instruction *cleanupDominator = nullptr;
if (CGF.needsEHCleanup(dtorKind)) {		if (CGF.needsEHCleanup(dtorKind)) {
▲ Show 20 Lines • Show All 731 Lines • ▼ Show 20 Lines	if (E->isTransparent())
return Visit(E->getInit(0));		return Visit(E->getInit(0));

AggValueSlot Dest = EnsureSlot(E->getType());		AggValueSlot Dest = EnsureSlot(E->getType());

LValue DestLV = CGF.MakeAddrLValue(Dest.getAddress(), E->getType());		LValue DestLV = CGF.MakeAddrLValue(Dest.getAddress(), E->getType());

// Handle initialization of an array.		// Handle initialization of an array.
if (E->getType()->isArrayType()) {		if (E->getType()->isArrayType()) {
QualType elementType =
CGF.getContext().getAsArrayType(E->getType())->getElementType();

auto AType = cast<llvm::ArrayType>(Dest.getAddress().getElementType());		auto AType = cast<llvm::ArrayType>(Dest.getAddress().getElementType());
EmitArrayInit(Dest.getAddress(), AType, elementType, E);		EmitArrayInit(Dest.getAddress(), AType, E->getType(), E);
return;		return;
}		}

assert(E->getType()->isRecordType() && "Only support structs/unions here!");		assert(E->getType()->isRecordType() && "Only support structs/unions here!");

// Do struct initialization; this code just sets each individual member		// Do struct initialization; this code just sets each individual member
// to the approprate value. This makes bitfield support automatic;		// to the approprate value. This makes bitfield support automatic;
// the disadvantage is that the generated code is more difficult for		// the disadvantage is that the generated code is more difficult for
▲ Show 20 Lines • Show All 497 Lines • Show Last 20 Lines

test/CodeGen/init.c

	// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s			// RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s

	struct I { int k[3]; };			struct I { int k[3]; };
	struct M { struct I o[2]; };			struct M { struct I o[2]; };
	struct M v1[1] = { [0].o[0 ... 1].k[0 ... 1] = 4, 5 };			struct M v1[1] = { [0].o[0 ... 1].k[0 ... 1] = 4, 5 };
	unsigned v2[2][3] = {[0 ... 1][0 ... 1] = 2222, 3333};			unsigned v2[2][3] = {[0 ... 1][0 ... 1] = 2222, 3333};

	// CHECK-DAG: %struct.M = type { [2 x %struct.I] }			// CHECK-DAG: %struct.M = type { [2 x %struct.I] }
	// CHECK-DAG: %struct.I = type { [3 x i32] }			// CHECK-DAG: %struct.I = type { [3 x i32] }

	// CHECK: [1 x %struct.M] [%struct.M { [2 x %struct.I] [%struct.I { [3 x i32] [i32 4, i32 4, i32 0] }, %struct.I { [3 x i32] [i32 4, i32 4, i32 5] }] }],			// CHECK-DAG: [1 x %struct.M] [%struct.M { [2 x %struct.I] [%struct.I { [3 x i32] [i32 4, i32 4, i32 0] }, %struct.I { [3 x i32] [i32 4, i32 4, i32 5] }] }],
	// CHECK: [2 x [3 x i32]] {{[[][[]}}3 x i32] [i32 2222, i32 2222, i32 0], [3 x i32] [i32 2222, i32 2222, i32 3333]],			// CHECK-DAG: [2 x [3 x i32]] {{[[][[]}}3 x i32] [i32 2222, i32 2222, i32 0], [3 x i32] [i32 2222, i32 2222, i32 3333]],
				// CHECK-DAG: [[INIT14:.*]] = private global [16 x i32] [i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 0, i32 0, i32 0, i32 0], align 4

	void f1() {			void f1() {
	// Scalars in braces.			// Scalars in braces.
	int a = { 1 };			int a = { 1 };
	}			}

	void f2() {			void f2() {
	int a[2][2] = { { 1, 2 }, { 3, 4 } };			int a[2][2] = { { 1, 2 }, { 3, 4 } };
	int b[3][3] = { { 1, 2 }, { 3, 4 } };			int b[3][3] = { { 1, 2 }, { 3, 4 } };
	int *c[2] = { &a[1][1], &b[2][2] };			int *c[2] = { &a[1][1], &b[2][2] };
	int *d[2][2] = { {&a[1][1], &b[2][2]}, {&a[0][0], &b[1][1]} };			int *d[2][2] = { {&a[1][1], &b[2][2]}, {&a[0][0], &b[1][1]} };
	int *e[3][3] = { {&a[1][1], &b[2][2]}, {&a[0][0], &b[1][1]} };			int *e[3][3] = { {&a[1][1], &b[2][2]}, {&a[0][0], &b[1][1]} };
	char ext[3][3] = {".Y",".U",".V"};			char ext[3][3] = {".Y",".U",".V"};
	}			}

	typedef void (* F)(void);			typedef void (* F)(void);
	extern void foo(void);			extern void foo(void);
	struct S { F f; };			struct S { F f; };
	void f3() {			void f3() {
	struct S a[1] = { { foo } };			struct S a[1] = { { foo } };
	}			}

	// Constants			// Constants
	// CHECK: @g3 = constant i32 10			// CHECK-DAG: @g3 = constant i32 10
	// CHECK: @f4.g4 = internal constant i32 12			// CHECK-DAG: @f4.g4 = internal constant i32 12
	const int g3 = 10;			const int g3 = 10;
	int f4() {			int f4() {
	static const int g4 = 12;			static const int g4 = 12;
	return g4;			return g4;
	}			}

	// PR6537			// PR6537
	typedef union vec3 {			typedef union vec3 {
	Show All 10 Lines
	void f6() {			void f6() {
	int x;			int x;
	long ids[] = { (long) &x };			long ids[] = { (long) &x };
	}			}




	// CHECK: @test7 = global{{.*}}{ i32 0, [4 x i8] c"bar\00" }			// CHECK-DAG: @test7 = global{{.*}}{ i32 0, [4 x i8] c"bar\00" }
	// PR8217			// PR8217
	struct a7 {			struct a7 {
	int b;			int b;
	char v[];			char v[];
	};			};

	struct a7 test7 = { .b = 0, .v = "bar" };			struct a7 test7 = { .b = 0, .v = "bar" };

	▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines

	// CHECK-LABEL: @PR20473			// CHECK-LABEL: @PR20473
	void PR20473() {			void PR20473() {
	// CHECK: memcpy{{.*}}getelementptr inbounds ([2 x i8], [2 x i8]* @			// CHECK: memcpy{{.*}}getelementptr inbounds ([2 x i8], [2 x i8]* @
	bar((char[2]) {""});			bar((char[2]) {""});
	// CHECK: memcpy{{.*}}getelementptr inbounds ([3 x i8], [3 x i8]* @			// CHECK: memcpy{{.*}}getelementptr inbounds ([3 x i8], [3 x i8]* @
	bar((char[3]) {""});			bar((char[3]) {""});
	}			}

				// Test that we initialize large member arrays by copying from a global and not
				// with a series of stores.
				struct S14 { int a[16]; };

				void test14(struct S14 *s14) {
				// CHECK-LABEL: @test14
				// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 {{.*}}, i8* align 4 {{.*}} [[INIT14]] {{.*}}, i32 64, i1 false)
				// CHECK-NOT: store
				// CHECK: ret void
				*s14 = (struct S14) { { [5 ... 11] = 17 } };
				}