Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -773,6 +773,7 @@
 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8454,6 +8454,17 @@
                                    (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
+// Combine two consecutive 16-byte loads with a common destination register into
+// one 32-byte load to that register.
+let Predicates = [HasAVX, HasFastMem32] in {
+  // TODO: Add patterns for other data types, aligned ops, and stores.
+  def : Pat<(insert_subvector
+              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
+              (loadv4f32 (add addr:$src, (iPTR 16))),
+              (iPTR 4)),
+            (VMOVUPSYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX1Only] in {
   def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                     (iPTR imm)),
Index: test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -44,3 +44,29 @@
   store <8 x float> %A, <8 x float>* %P, align 16
   ret void
 }
+
+define <8 x float> @combine_16_byte_loads(float* nocapture readonly %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %p1 = bitcast float* %ptr to <4 x float>*
+  %v1 = load <4 x float>* %p1, align 1
+  %ptr2 = getelementptr inbounds float* %ptr, i64 4
+  %p2 = bitcast float* %ptr2 to <4 x float>*
+  %v2 = load <4 x float>* %p2, align 1
+  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
+  ret <8 x float> %v3
+}
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
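
For context, a minimal C sketch (not part of the patch; the function name mirrors
the test and the intrinsic sequence is assumed) of the access pattern the new
TableGen pattern targets. Clang lowers this intrinsic sequence to the
shufflevector + @llvm.x86.avx.vinsertf128.ps.256 IR exercised by the test above:

    #include <immintrin.h>

    /* Two unaligned 16-byte loads inserted into the lower and upper
       128-bit lanes of one 256-bit register. _mm256_insertf128_ps is
       what produces the vinsertf128 matched by the new pattern. */
    __m256 combine_16_byte_loads(const float *ptr) {
        __m128 lo = _mm_loadu_ps(ptr);     /* bytes 0..15 of the source  */
        __m128 hi = _mm_loadu_ps(ptr + 4); /* bytes 16..31, contiguous   */
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
    }

Compiled at -O2 with AVX enabled for a subtarget where unaligned 32-byte accesses
are fast (e.g. btver2 or Haswell), this should now emit a single vmovups rather
than a vmovups + vinsertf128 pair; on Sandy Bridge, where 32-byte unaligned loads
are slow, the two-instruction sequence is kept, as the SANDYB checks confirm.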