All targets either just return false here or properly model Fast, so I
don't think there is any reason to prevent CodeGen from doing the right
thing here.
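As a hedged illustration of what "small overlapping stores" means here (the 15-byte and 7-byte sizes and the helper names below are my own example, not code from the patch): a copy or clear whose length is not a power of two can be covered by two power-of-two accesses whose ranges overlap in the middle, instead of a chain of progressively smaller tail accesses.

  #include <cstdint>
  #include <cstring>

  // Copy 15 bytes as two overlapping 8-byte chunks, [0, 8) and [7, 15).
  // The byte at offset 7 is written twice, which is fine for memcpy since
  // src and dst don't overlap; it avoids an 8+4+2+1-byte tail sequence.
  void copy15(char *dst, const char *src) {
    std::uint64_t tmp;
    std::memcpy(&tmp, src, 8);      // load  [0, 8)
    std::memcpy(dst, &tmp, 8);      // store [0, 8)
    std::memcpy(&tmp, src + 7, 8);  // load  [7, 15)
    std::memcpy(dst + 7, &tmp, 8);  // store [7, 15)
  }

  // Zero 7 bytes as two overlapping 4-byte stores, [0, 4) and [3, 7).
  void zero7(char *dst) {
    std::uint32_t zero = 0;
    std::memcpy(dst, &zero, 4);
    std::memcpy(dst + 3, &zero, 4);
  }

The interleaved load/store/load/store order in copy15 mirrors the AArch64 output discussed in the comments below.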
Details

- Reviewers: andreadb, spatel, pcordes, hfinkel, javed.absar, RKSimon
- Commits:
  - rG76f4ae109266: [CodeGen] Allow mempcy/memset to generate small overlapping stores.
  - rG93b344577077: [CodeGen] Allow mempcy/memset to generate small overlapping stores.
  - rL349016: [CodeGen] Allow mempcy/memset to generate small overlapping stores.
  - rL348843: [CodeGen] Allow mempcy/memset to generate small overlapping stores.

Diff Detail

- Repository: rL LLVM
Event Timeline
Looks pretty good. Many of the things I pointed out aren't *problems* with this patch so much as room for further improvement.
But I think we should definitely look at doing both loads first, before either store, like I commented on the AArch64 asm.
Maybe assign a small cost to the interleaved loadu/storeu, load/store ordering, so register pressure can still result in that code-gen, but in cases of no register pressure we get load/loadu, storeu/store.
If there are any uarches (maybe some in-order ARM/AArch64?) where alternating loads and stores is a lot worse than on high-end Intel/AMD, we'll want to tune that cost or just force it to always free up a 2nd register for tmp data.
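To make that suggestion concrete, here is a hedged C++ rendering of the "both loads, then both stores" shape for the 15-byte case (the function name and use of two temporaries are my own):

  #include <cstdint>
  #include <cstring>

  // Both loads complete before either store. This ties up two temporary
  // registers instead of one, but the second load can no longer be flagged
  // as depending on the first store, and the same sequence is also correct
  // when src and dst overlap (i.e. it works for memmove).
  void copy15_loads_first(char *dst, const char *src) {
    std::uint64_t lo, hi;
    std::memcpy(&lo, src, 8);      // load [0, 8)
    std::memcpy(&hi, src + 7, 8);  // load [7, 15)
    std::memcpy(dst + 7, &hi, 8);  // store the unaligned tail first
    std::memcpy(dst, &lo, 8);      // store the aligned head last
  }

Storing the aligned head last matches the ldr/ldur/stur/str order suggested in the inline comment below, so a consumer that reloads the start of the buffer can store-forward from a single store.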
test/CodeGen/AArch64/arm64-memcpy-inline.ll:20

It's normally best to do both loads before either store (like glibc's memcpy does). This allows the same code to work for memmove, and there are some microarchitectural advantages even without true overlap.

If memory disambiguation on any AArch64 CPUs works like on Intel's x86 chips, then when src and dst are offset by a multiple of 4k, the 2nd load will be detected as possibly having a dependency on the unaligned store. (They overlap based on the low 12 bits of the address, i.e. the offset within a page. The HW looks for partial matches first and only then verifies, because wider content-addressable memory in the store buffer would be expensive, and probably also because it starts checking before the TLB translation of the page-number bits is available.)

https://software.intel.com/en-us/vtune-amplifier-help-4k-aliasing
https://software.intel.com/sites/default/files/managed/04/59/TuningGuide_IntelXeonProcessor_ScalableFamily_1stGen.pdf describes the penalty on Intel SKX as ~7 cycles of extra latency to replay the load, with potentially worse penalties when it involves a cache-line split for an unaligned load. (So there's a throughput cost from replaying the load uop, as well as latency.) In-order uarches may benefit even more from doing both loads and then both stores, hiding more of the latency.

I think this is doing an aligned store for the first 8 bytes of the data being copied; that's good, because the most likely consumer of a memcpy is an aligned load from the start of the buffer. Doing that store last allows store-forwarding to succeed, because all the data comes from one store. So we probably want to do the load of that data first, making a chain of memcpy efficient. E.g. for a 15-byte copy, we might want:

  ldr  x10, [x0]
  ldur x11, [x0, #7]
  stur x11, [x1, #7]
  str  x10, [x1]

The equivalent of that on x86 is probably best for Intel and AMD's store-forwarding rules.
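As a back-of-the-envelope model of the 4K-aliasing check described above (a simplification for illustration, not how any particular CPU is documented to work): the first-pass disambiguation compares only page-offset bits, so a load is flagged as possibly dependent on an earlier in-flight store whenever their low-12-bit ranges overlap.

  #include <cstdint>

  // Simplified model of the first-pass memory-disambiguation check: compare
  // only bits [11:0] (the offset within a 4 KiB page). Accesses whose
  // page-offset ranges overlap are treated as potentially dependent even if
  // the full addresses differ, triggering the replay penalty described above.
  // (Ignores wrap-around at the page boundary for simplicity.)
  bool mayFalselyAlias(std::uintptr_t storeAddr, unsigned storeSize,
                       std::uintptr_t loadAddr, unsigned loadSize) {
    std::uintptr_t s = storeAddr & 0xfff;
    std::uintptr_t l = loadAddr & 0xfff;
    return s < l + loadSize && l < s + storeSize;
  }

With src and dst offset by a multiple of 4096, the store of dst bytes [0, 8) and the later load of src bytes [7, 15) in the interleaved order report exactly such an overlap, which is the replay case the links above describe.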
test/CodeGen/X86/memset-zero.ll:314

There's a code-size vs. uop-count tradeoff here. Zeroing one register with a 2-byte xor %edx,%edx would save 4 bytes in each of the following movl $imm32 instructions. Especially on CPUs without a uop cache, it may well be a win to have one extra cheap uop go through the pipeline to avoid decode bottlenecks that might limit how far ahead the CPU can "see" in the instruction stream.

test/CodeGen/X86/memset-zero.ll:327

In 64-bit mode, Intel CPUs won't micro-fuse an instruction that has both an immediate and a RIP-relative addressing mode. So if this were a static object being memset instead of a pointer in a register, each mov instruction would decode to 2 separate fused-domain uops (store-address and store-data). This would make it *definitely* worth it to zero a register and use movl %ecx, 31+buf(%rip), movq %rcx, 24+buf(%rip), etc. Even on Core 2, where register-read stalls can be a problem, this is unlikely to hurt because the register is written right before being read.

Of course you also have the option of doing a RIP-relative LEA (7 bytes) to save 3 bytes per instruction (reg+disp8 instead of RIP+rel32). But for static data you know the alignment, so you can use movaps for all the aligned parts and hopefully end up with few total instructions.
test/CodeGen/X86/unaligned-load.ll:11–53

We can align stack objects to a 16-byte boundary, or at worst we *know* their alignment relative to a 16-byte boundary. We can use movaps for the aligned part at least, even if we use scalar stores for the unaligned start/end (potentially overlapping the vector store). movaps is fast on any CPU that has it; only movups is slow on pre-Nehalem CPUs.

Storing the first few bytes of an object with a 4-byte mov-immediate might be bad for store-forwarding if it's an array or struct of 8-byte elements. (But the x86-64 System V ABI requires that any array on the stack outside a struct has 16-byte alignment if it's at least 16 bytes in size, so misaligned arrays that we access relative to RSP should normally only happen inside a struct.)
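A hedged SSE2 sketch of the shape being suggested here, for a hypothetical 31-byte zeroing of a buffer whose start is known to be 16-byte aligned (the size, function name, and mix of one vector store plus two overlapping scalar stores are my own illustration):

  #include <cstdint>
  #include <cstring>
  #include <emmintrin.h>  // SSE2 intrinsics

  // Zero 31 bytes starting at a 16-byte-aligned address: one aligned
  // 16-byte vector store for [0, 16), then two 8-byte scalar stores for
  // the tail, the first of which overlaps the vector store at byte 15.
  void zero31_aligned16(char *dst) {
    const __m128i vzero = _mm_setzero_si128();
    const std::uint64_t qzero = 0;
    _mm_store_si128(reinterpret_cast<__m128i *>(dst), vzero);  // aligned [0, 16)
    std::memcpy(dst + 15, &qzero, 8);                          // [15, 23)
    std::memcpy(dst + 23, &qzero, 8);                          // [23, 31)
  }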
test/CodeGen/X86/memset-zero.ll:314

See PR24448: https://bugs.llvm.org/show_bug.cgi?id=24448
LGTM - the code change itself is just removing a hack.
If some target does see a perf regression from this, they should be able to adjust their target hooks to compensate (although they may ask to revert this patch while fixing).
It would be nice to file a bug for the load/store ordering that Peter noted. I filed something similar here:
https://bugs.llvm.org/show_bug.cgi?id=27143
lib/CodeGen/SelectionDAG/SelectionDAG.cpp:5400–5401

clang-format should move that onto the previous line?
Thanks for the comments!
test/CodeGen/AArch64/arm64-memcpy-inline.ll:20

Thanks, I've filed PR39953.
test/CodeGen/X86/memset-zero.ll:327

IIUC this would be addressed if we fix PR24448, right?
test/CodeGen/X86/unaligned-load.ll:11–53

Good point. I've added a test in this file to show what happens when the data is aligned: we're also failing to select movabs. I've filed PR39952.
> IIUC this would be addressed if we fix PR24448, right?
Yup, https://bugs.llvm.org/show_bug.cgi?id=24448 is exactly the same as what I pointed out. Hoisting constants for big savings in code-size, and maybe even large displacements, too.
Thanks for taking the time to file https://bugs.llvm.org/show_bug.cgi?id=39953 and https://bugs.llvm.org/show_bug.cgi?id=39952. I have a bad habit of pointing out a bunch of things in one post instead of filing separate reports.
Yes, sorry about that. I only had X86, AArch64 and Power in my TARGETS_TO_BUILD :(
I'll update the patch with TARGETS_TO_BUILD=all.
> clang-format should move that onto the previous line?