Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -773,6 +773,7 @@
 def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8454,6 +8454,17 @@
                           (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
+// Combine two consecutive 16-byte loads with a common destination register into
+// one 32-byte load to that register.
+let Predicates = [HasAVX, HasFastMem32] in {
+  // TODO: Add patterns for other data types, aligned ops, and stores.
+  def : Pat<(insert_subvector
+              (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))),
+              (loadv4f32 (add addr:$src, (iPTR 16))),
+              (iPTR 4)),
+            (VMOVUPSYrm addr:$src)>;
+}
+
 let Predicates = [HasAVX1Only] in {
   def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                     (iPTR imm)),
Index: test/CodeGen/X86/unaligned-32-byte-memops.ll
===================================================================
--- test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK
 
 ; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
 ; because that is slower than two 16-byte loads.
@@ -44,3 +44,103 @@
   store <8 x float> %A, <8 x float>* %P, align 16
   ret void
 }
+
+; Merge two consecutive 16-byte subvector loads into a single 32-byte load
+; if it's faster.
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+; Use the vinsertf128 intrinsic to model source code
+; that explicitly uses AVX intrinsics.
+define <8 x float> @combine_16_byte_loads(float* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %p1 = bitcast float* %ptr to <4 x float>*
+  %v1 = load <4 x float>* %p1, align 1
+  %ptr2 = getelementptr inbounds float* %ptr, i64 4
+  %p2 = bitcast float* %ptr2 to <4 x float>*
+  %v2 = load <4 x float>* %p2, align 1
+  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
+  ret <8 x float> %v3
+}
+
+; Swap the operands of the shufflevector and vinsertf128 to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_swap(float* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %p1 = bitcast float* %ptr to <4 x float>*
+  %v1 = load <4 x float>* %p1, align 1
+  %ptr2 = getelementptr inbounds float* %ptr, i64 4
+  %p2 = bitcast float* %ptr2 to <4 x float>*
+  %v2 = load <4 x float>* %p2, align 1
+  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
+  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
+  ret <8 x float> %v3
+}
+
+; Replace the vinsertf128 intrinsic with a shufflevector as might be
+; expected from auto-vectorized code.
+define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %v1 = load <4 x float>* %ptr, align 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %v3
+}
+
+; Swap the order of the shufflevector operands to ensure that the
+; pattern still matches.
+define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
+  ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap
+
+  ; SANDYB: vmovups
+  ; SANDYB-NEXT: vinsertf128
+  ; SANDYB-NEXT: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2-NEXT: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL-NEXT: retq
+
+  %v1 = load <4 x float>* %ptr, align 1
+  %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1
+  %v2 = load <4 x float>* %ptr2, align 1
+  %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+  ret <8 x float> %v3
+}
+
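Note (illustrative, not part of the patch): the new TableGen pattern targets the common AVX idiom of loading 32 unaligned bytes as two 16-byte halves and stitching them together with vinsertf128, which the first test above models in IR. A minimal C sketch of that idiom, using a hypothetical helper name, might look like the following; on CPUs whose subtarget does not report slow unaligned 32-byte accesses (e.g. btver2 or core-avx2/Haswell in the RUN lines), the added pattern should let this lower to a single 32-byte vmovups.

  #include <immintrin.h>

  /* Hypothetical helper, for illustration only: load 32 unaligned bytes as two
     16-byte halves and merge them -- the idiom the new pattern folds into one
     32-byte VMOVUPSYrm when HasFastMem32 holds. */
  static __m256 load_32_bytes_unaligned(const float *p) {
    __m128 lo = _mm_loadu_ps(p);      /* first 16-byte unaligned load   */
    __m128 hi = _mm_loadu_ps(p + 4);  /* next 16 bytes (4 floats later) */
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  }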