Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -953,6 +953,18 @@ return St->getAlignment() < St->getMemoryVT().getStoreSize(); }]>; +// nontemporal load fragments. +def nontemporalload : PatFrag<(ops node:$ptr), + (load node:$ptr), [{ + return cast<LoadSDNode>(N)->isNonTemporal(); +}]>; + +def alignednontemporalload : PatFrag<(ops node:$ptr), + (nontemporalload node:$ptr), [{ + LoadSDNode *Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); +}]>; + // setcc convenience fragments. def setoeq : PatFrag<(ops node:$lhs, node:$rhs), (setcc node:$lhs, node:$rhs, SETOEQ)>; Index: lib/Target/X86/X86FastISel.cpp =================================================================== --- lib/Target/X86/X86FastISel.cpp +++ lib/Target/X86/X86FastISel.cpp @@ -348,7 +348,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg, unsigned Alignment) { + bool HasSSE41 = Subtarget->hasSSE41(); bool HasAVX = Subtarget->hasAVX(); + bool HasAVX2 = Subtarget->hasAVX2(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; @@ -394,14 +398,18 @@ // No f80 support yet. return false; case MVT::v4f32: - if (Alignment >= 16) + if (IsNonTemporal && Alignment >= 16 && HasSSE41) + Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + else if (Alignment >= 16) Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm; else Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; RC = &X86::VR128RegClass; break; case MVT::v2f64: - if (Alignment >= 16) + if (IsNonTemporal && Alignment >= 16 && HasSSE41) + Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + else if (Alignment >= 16) Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm; else Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; @@ -411,7 +419,9 @@ case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (Alignment >= 16) + if (IsNonTemporal && Alignment >= 16) + Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm; + else if (Alignment >= 16) Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm; else Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; @@ -419,12 +429,18 @@ break; case MVT::v8f32: assert(HasAVX); - Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm; + if (IsNonTemporal && Alignment >= 32 && HasAVX2) + Opc = X86::VMOVNTDQAYrm; + else + Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm; RC = &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); - Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm; + if (IsNonTemporal && Alignment >= 32 && HasAVX2) + Opc = X86::VMOVNTDQAYrm; + else + Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm; RC = &X86::VR256RegClass; break; case MVT::v8i32: @@ -432,17 +448,26 @@ case MVT::v16i16: case MVT::v32i8: assert(HasAVX); - Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm; + if (IsNonTemporal && Alignment >= 32 && HasAVX2) + Opc = X86::VMOVNTDQAYrm; + else + Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm; RC = &X86::VR256RegClass; break; case MVT::v16f32: assert(Subtarget->hasAVX512()); - Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm; + if (IsNonTemporal && Alignment >= 64) + Opc = X86::VMOVNTDQAZrm; + else + Opc = (Alignment >= 64) ? 
X86::VMOVAPSZrm : X86::VMOVUPSZrm; RC = &X86::VR512RegClass; break; case MVT::v8f64: assert(Subtarget->hasAVX512()); - Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm; + if (IsNonTemporal && Alignment >= 64) + Opc = X86::VMOVNTDQAZrm; + else + Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm; RC = &X86::VR512RegClass; break; case MVT::v8i64: @@ -452,7 +477,10 @@ assert(Subtarget->hasAVX512()); // Note: There are a lot more choices based on type with AVX-512, but // there's really no advantage when the load isn't masked. - Opc = (Alignment >= 64) ? X86::VMOVDQA64Zmr : X86::VMOVDQU64Zmr; + if (IsNonTemporal && Alignment >= 64) + Opc = X86::VMOVNTDQAZrm; + else + Opc = (Alignment >= 64) ? X86::VMOVDQA64Zmr : X86::VMOVDQU64Zmr; RC = &X86::VR512RegClass; break; } Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -3321,6 +3321,19 @@ (VMOVNTDQZmr addr:$dst, VR512:$src)>; def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst), (VMOVNTDQZmr addr:$dst, VR512:$src)>; + + def : Pat<(v8f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v16f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v8i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v16i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v32i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v64i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; } let Predicates = [HasVLX], AddedComplexity = 400 in { @@ -3331,12 +3344,38 @@ def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst), (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(v4f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v8f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v4i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v8i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v16i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v32i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst), (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst), (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; + + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v4i32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v8i16 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v16i8 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; } //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -7248,6 +7248,7 @@ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 
} +let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteLoad] in { let Predicates = [HasAVX] in def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), @@ -7264,6 +7265,35 @@ [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>; } // SchedRW +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v4f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v4i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; +} + +} // AddedComplexity + //===----------------------------------------------------------------------===// // SSE4.2 - Compare Instructions //===----------------------------------------------------------------------===// Index: test/CodeGen/X86/fast-isel-nontemporal.ll =================================================================== --- test/CodeGen/X86/fast-isel-nontemporal.ll +++ test/CodeGen/X86/fast-isel-nontemporal.ll @@ -220,19 +220,29 @@ ; define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) { -; SSE-LABEL: test_load_nt4xfloat: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt4xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_load_nt4xfloat: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt4xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_load_nt4xfloat: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt4xfloat: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <4 x float>, <4 x float>* %ptr, align 16, !nontemporal !1 @@ -240,19 +250,29 @@ } define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) { -; SSE-LABEL: test_load_nt2xdouble: -; SSE: # BB#0: # %entry -; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt2xdouble: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_load_nt2xdouble: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movapd (%rdi), %xmm0 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt2xdouble: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_load_nt2xdouble: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovapd (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt2xdouble: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovapd (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <2 x double>, <2 x double>* %ptr, align 16, !nontemporal !1 @@ -262,17 
+282,17 @@ define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) { ; SSE-LABEL: test_load_nt16xi8: ; SSE: # BB#0: # %entry -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movntdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_load_nt16xi8: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt16xi8: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <16 x i8>, <16 x i8>* %ptr, align 16, !nontemporal !1 @@ -282,17 +302,17 @@ define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) { ; SSE-LABEL: test_load_nt8xi16: ; SSE: # BB#0: # %entry -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movntdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_load_nt8xi16: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt8xi16: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <8 x i16>, <8 x i16>* %ptr, align 16, !nontemporal !1 @@ -302,17 +322,17 @@ define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) { ; SSE-LABEL: test_load_nt4xi32: ; SSE: # BB#0: # %entry -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movntdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_load_nt4xi32: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt4xi32: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <4 x i32>, <4 x i32>* %ptr, align 16, !nontemporal !1 @@ -322,17 +342,17 @@ define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) { ; SSE-LABEL: test_load_nt2xi64: ; SSE: # BB#0: # %entry -; SSE-NEXT: movdqa (%rdi), %xmm0 +; SSE-NEXT: movntdqa (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_load_nt2xi64: ; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_load_nt2xi64: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq entry: %0 = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !1 @@ -480,20 +500,37 @@ ; define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) { -; SSE-LABEL: test_load_nt8xfloat: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt8xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt8xfloat: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt8xfloat: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt8xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt8xfloat: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt8xfloat: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt8xfloat: ; AVX512: 
# BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !1 @@ -501,20 +538,37 @@ } define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) { -; SSE-LABEL: test_load_nt4xdouble: -; SSE: # BB#0: # %entry -; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: movapd 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt4xdouble: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd (%rdi), %xmm0 +; SSE2-NEXT: movapd 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt4xdouble: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovapd (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt4xdouble: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movapd (%rdi), %xmm0 +; SSE4A-NEXT: movapd 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt4xdouble: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt4xdouble: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt4xdouble: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt4xdouble: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovapd (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <4 x double>, <4 x double>* %ptr, align 32, !nontemporal !1 @@ -522,20 +576,37 @@ } define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) { -; SSE-LABEL: test_load_nt32xi8: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt32xi8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt32xi8: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt32xi8: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt32xi8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt32xi8: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt32xi8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt32xi8: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <32 x i8>, <32 x i8>* %ptr, align 32, !nontemporal !1 @@ -543,20 +614,37 @@ } define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) { -; SSE-LABEL: test_load_nt16xi16: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt16xi16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt16xi16: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt16xi16: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt16xi16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: 
movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt16xi16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt16xi16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt16xi16: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <16 x i16>, <16 x i16>* %ptr, align 32, !nontemporal !1 @@ -564,20 +652,37 @@ } define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) { -; SSE-LABEL: test_load_nt8xi32: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt8xi32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt8xi32: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt8xi32: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt8xi32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt8xi32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt8xi32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt8xi32: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <8 x i32>, <8 x i32>* %ptr, align 32, !nontemporal !1 @@ -585,20 +690,37 @@ } define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) { -; SSE-LABEL: test_load_nt4xi64: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt4xi64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt4xi64: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt4xi64: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt4xi64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt4xi64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt4xi64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt4xi64: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* %ptr, align 32, !nontemporal !1 @@ -776,23 +898,45 @@ ; define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) { -; SSE-LABEL: test_load_nt16xfloat: -; SSE: # BB#0: # %entry -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt16xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps (%rdi), %xmm0 
+; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt16xfloat: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt16xfloat: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movaps (%rdi), %xmm0 +; SSE4A-NEXT: movaps 16(%rdi), %xmm1 +; SSE4A-NEXT: movaps 32(%rdi), %xmm2 +; SSE4A-NEXT: movaps 48(%rdi), %xmm3 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt16xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt16xfloat: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt16xfloat: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt16xfloat: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq entry: %0 = load <16 x float>, <16 x float>* %ptr, align 64, !nontemporal !1 @@ -800,23 +944,45 @@ } define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) { -; SSE-LABEL: test_load_nt8xdouble: -; SSE: # BB#0: # %entry -; SSE-NEXT: movapd (%rdi), %xmm0 -; SSE-NEXT: movapd 16(%rdi), %xmm1 -; SSE-NEXT: movapd 32(%rdi), %xmm2 -; SSE-NEXT: movapd 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_load_nt8xdouble: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd (%rdi), %xmm0 +; SSE2-NEXT: movapd 16(%rdi), %xmm1 +; SSE2-NEXT: movapd 32(%rdi), %xmm2 +; SSE2-NEXT: movapd 48(%rdi), %xmm3 +; SSE2-NEXT: retq ; -; AVX-LABEL: test_load_nt8xdouble: -; AVX: # BB#0: # %entry -; AVX-NEXT: vmovapd (%rdi), %ymm0 -; AVX-NEXT: vmovapd 32(%rdi), %ymm1 -; AVX-NEXT: retq +; SSE4A-LABEL: test_load_nt8xdouble: +; SSE4A: # BB#0: # %entry +; SSE4A-NEXT: movapd (%rdi), %xmm0 +; SSE4A-NEXT: movapd 16(%rdi), %xmm1 +; SSE4A-NEXT: movapd 32(%rdi), %xmm2 +; SSE4A-NEXT: movapd 48(%rdi), %xmm3 +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_load_nt8xdouble: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: test_load_nt8xdouble: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovapd (%rdi), %ymm0 +; AVX1-NEXT: vmovapd 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_load_nt8xdouble: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_load_nt8xdouble: ; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovapd (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq entry: %0 = load <8 x double>, <8 x double>* %ptr, align 64, !nontemporal !1 Index: test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- test/CodeGen/X86/nontemporal-loads.ll +++ test/CodeGen/X86/nontemporal-loads.ll @@ -10,43 +10,53 @@ ; FIXME: Tests for nontemporal load support which was introduced in SSE41 define <4 x float> @test_v4f32(<4 x float>* %src) { -; SSE-LABEL: test_v4f32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4f32: +; SSE2: # BB#0: 
+; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512-NEXT: retq %1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1 ret <4 x float> %1 } define <4 x i32> @test_v4i32(<4 x i32>* %src) { -; SSE-LABEL: test_v4i32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; ; AVX512F-LABEL: test_v4i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 +; AVX512F-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_v4i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v4i32: @@ -58,117 +68,97 @@ } define <2 x double> @test_v2f64(<2 x double>* %src) { -; SSE-LABEL: test_v2f64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_v2f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v2f64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovapd (%rdi), %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2f64: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1 ret <2 x double> %1 } define <2 x i64> @test_v2i64(<2 x i64>* %src) { -; SSE-LABEL: test_v2i64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2i64: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_v2i64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v2i64: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1 ret <2 x i64> %1 } define <8 x i16> @test_v8i16(<8 x i16>* %src) { -; SSE-LABEL: test_v8i16: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; 
SSE-NEXT: retq +; SSE2-LABEL: test_v8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_v8i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v8i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1 ret <8 x i16> %1 } define <16 x i8> @test_v16i8(<16 x i8>* %src) { -; SSE-LABEL: test_v16i8: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16i8: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovntdqa (%rdi), %xmm0 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_v16i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v16i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %xmm0 +; AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1 ret <16 x i8> %1 } @@ -176,45 +166,67 @@ ; And now YMM versions. 
define <8 x float> @test_v8f32(<8 x float>* %src) { -; SSE-LABEL: test_v8f32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f32: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8f32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8f32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8f32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512-NEXT: retq %1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1 ret <8 x float> %1 } define <8 x i32> @test_v8i32(<8 x i32>* %src) { -; SSE-LABEL: test_v8i32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8i32: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_v8i32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 +; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_v8i32: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v8i32: @@ -226,121 +238,125 @@ } define <4 x double> @test_v4f64(<4 x double>* %src) { -; SSE-LABEL: test_v4f64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4f64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE2-LABEL: test_v4f64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4f64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX512F-LABEL: test_v4f64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: retq +; AVX1-LABEL: test_v4f64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq ; -; AVX512BW-LABEL: test_v4f64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: retq +; AVX2-LABEL: test_v4f64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_v4f64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v4f64: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1 ret <4 x double> %1 } define <4 x i64> @test_v4i64(<4 x i64>* %src) { -; SSE-LABEL: test_v4i64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), 
%xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE2-LABEL: test_v4i64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v4i64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX512F-LABEL: test_v4i64: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: retq +; AVX1-LABEL: test_v4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq ; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: retq +; AVX2-LABEL: test_v4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_v4i64: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v4i64: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1 ret <4 x i64> %1 } define <16 x i16> @test_v16i16(<16 x i16>* %src) { -; SSE-LABEL: test_v16i16: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: test_v16i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE2-LABEL: test_v16i16: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i16: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX512F-LABEL: test_v16i16: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: retq +; AVX1-LABEL: test_v16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq ; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: retq +; AVX2-LABEL: test_v16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v16i16: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1 ret <16 x i16> %1 } define <32 x i8> @test_v32i8(<32 x i8>* %src) { -; SSE-LABEL: test_v32i8: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: test_v32i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: retq +; SSE2-LABEL: test_v32i8: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i8: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: retq ; -; AVX512F-LABEL: test_v32i8: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: retq +; AVX1-LABEL: test_v32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: retq ; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: retq +; AVX2-LABEL: test_v32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # BB#0: -; 
AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i8: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1 ret <32 x i8> %1 } @@ -348,162 +364,246 @@ ; And now ZMM versions. define <16 x float> @test_v16f32(<16 x float>* %src) { -; SSE-LABEL: test_v16f32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f32: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16f32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16f32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16f32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16f32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1 ret <16 x float> %1 } define <16 x i32> @test_v16i32(<16 x i32>* %src) { -; SSE-LABEL: test_v16i32: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16i32: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16i32: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v16i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v16i32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1 ret <16 x i32> %1 } define <8 x double> @test_v8f64(<8 x double>* %src) { -; SSE-LABEL: test_v8f64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8f64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8f64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; 
SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8f64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8f64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8f64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8f64: ; AVX512: # BB#0: -; AVX512-NEXT: vmovapd (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1 ret <8 x double> %1 } define <8 x i64> @test_v8i64(<8 x i64>* %src) { -; SSE-LABEL: test_v8i64: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v8i64: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v8i64: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v8i64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v8i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: test_v8i64: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512-NEXT: retq %1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1 ret <8 x i64> %1 } define <32 x i16> @test_v32i16(<32 x i16>* %src) { -; SSE-LABEL: test_v32i16: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v32i16: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v32i16: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v32i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v32i16: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v32i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_v32i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_v32i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 
(%rdi), %zmm0 +; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v32i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa64 32(%rdi), %ymm1 +; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1 ret <32 x i16> %1 } define <64 x i8> @test_v64i8(<64 x i8>* %src) { -; SSE-LABEL: test_v64i8: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: movaps 16(%rdi), %xmm1 -; SSE-NEXT: movaps 32(%rdi), %xmm2 -; SSE-NEXT: movaps 48(%rdi), %xmm3 -; SSE-NEXT: retq +; SSE2-LABEL: test_v64i8: +; SSE2: # BB#0: +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: movaps 32(%rdi), %xmm2 +; SSE2-NEXT: movaps 48(%rdi), %xmm3 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v64i8: +; SSE41: # BB#0: +; SSE41-NEXT: movntdqa (%rdi), %xmm0 +; SSE41-NEXT: movntdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movntdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movntdqa 48(%rdi), %xmm3 +; SSE41-NEXT: retq ; -; AVX-LABEL: test_v64i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: retq +; AVX1-LABEL: test_v64i8: +; AVX1: # BB#0: +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v64i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_v64i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_v64i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VL-LABEL: test_v64i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa64 32(%rdi), %ymm1 +; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1 ret <64 x i8> %1