
Commit 35c06a0

Committed Jun 7, 2016
[X86][SSE] Add general lowering of nontemporal vector loads (fixed bad merge)
Currently the only way to use the (V)MOVNTDQA nontemporal vector load instructions is through the int_x86_sse41_movntdqa style builtins. This patch adds support for lowering nontemporal loads from general IR, allowing us to remove the movntdqa builtins in a future patch.

We currently still fold nontemporal loads into suitable instructions; we should probably look at removing this (and nontemporal stores as well), or at least make the target's folding implementation aware that it's dealing with a nontemporal memory transaction.

There is also an issue that VMOVNTDQA only acts on 128-bit vectors on pre-AVX2 hardware, so a normal ymm load is still used on AVX1 targets.

Differential Revision: http://reviews.llvm.org/D20965

llvm-svn: 272011
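For illustration, a hypothetical IR snippet (not taken from the patch or its tests) showing the kind of load this change now handles: a 16-byte-aligned vector load tagged with !nontemporal metadata, which fast-isel can now select to (V)MOVNTDQA on SSE4.1+ targets.

; Hypothetical example: a 16-byte-aligned nontemporal vector load.
; With this patch, fast-isel selects MOVNTDQA (VMOVNTDQA with AVX)
; instead of MOVDQA when SSE4.1 is available.
define <2 x i64> @load_nt_v2i64(<2 x i64>* %ptr) {
  %v = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !0
  ret <2 x i64> %v
}
!0 = !{i32 1}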
1 parent 9a89623 · commit 35c06a0

1 file changed: llvm/lib/Target/X86/X86FastISel.cpp (+37 −9)
@@ -348,7 +348,11 @@ bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
 bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
                                   MachineMemOperand *MMO, unsigned &ResultReg,
                                   unsigned Alignment) {
+  bool HasSSE41 = Subtarget->hasSSE41();
   bool HasAVX = Subtarget->hasAVX();
+  bool HasAVX2 = Subtarget->hasAVX2();
+  bool IsNonTemporal = MMO && MMO->isNonTemporal();
+
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
   const TargetRegisterClass *RC = nullptr;
@@ -394,14 +398,18 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     // No f80 support yet.
     return false;
   case MVT::v4f32:
-    if (Alignment >= 16)
+    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
       Opc = HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm;
     else
       Opc = HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
     RC = &X86::VR128RegClass;
     break;
   case MVT::v2f64:
-    if (Alignment >= 16)
+    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
+      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
       Opc = HasAVX ? X86::VMOVAPDrm : X86::MOVAPDrm;
     else
       Opc = HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
@@ -411,38 +419,55 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   case MVT::v2i64:
   case MVT::v8i16:
   case MVT::v16i8:
-    if (Alignment >= 16)
+    if (IsNonTemporal && Alignment >= 16)
+      Opc = HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
+    else if (Alignment >= 16)
       Opc = HasAVX ? X86::VMOVDQArm : X86::MOVDQArm;
     else
       Opc = HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
     RC = &X86::VR128RegClass;
     break;
   case MVT::v8f32:
     assert(HasAVX);
-    Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = X86::VMOVNTDQAYrm;
+    else
+      Opc = (Alignment >= 32) ? X86::VMOVAPSYrm : X86::VMOVUPSYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v4f64:
     assert(HasAVX);
-    Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = X86::VMOVNTDQAYrm;
+    else
+      Opc = (Alignment >= 32) ? X86::VMOVAPDYrm : X86::VMOVUPDYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v8i32:
   case MVT::v4i64:
   case MVT::v16i16:
   case MVT::v32i8:
     assert(HasAVX);
-    Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
+    if (IsNonTemporal && Alignment >= 32 && HasAVX2)
+      Opc = X86::VMOVNTDQAYrm;
+    else
+      Opc = (Alignment >= 32) ? X86::VMOVDQAYrm : X86::VMOVDQUYrm;
     RC = &X86::VR256RegClass;
     break;
   case MVT::v16f32:
     assert(Subtarget->hasAVX512());
-    Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVAPSZrm : X86::VMOVUPSZrm;
     RC = &X86::VR512RegClass;
     break;
   case MVT::v8f64:
     assert(Subtarget->hasAVX512());
-    Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVAPDZrm : X86::VMOVUPDZrm;
     RC = &X86::VR512RegClass;
     break;
   case MVT::v8i64:
@@ -452,7 +477,10 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
     assert(Subtarget->hasAVX512());
     // Note: There are a lot more choices based on type with AVX-512, but
     // there's really no advantage when the load isn't masked.
-    Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
+    if (IsNonTemporal && Alignment >= 64)
+      Opc = X86::VMOVNTDQAZrm;
+    else
+      Opc = (Alignment >= 64) ? X86::VMOVDQA64Zrm : X86::VMOVDQU64Zrm;
     RC = &X86::VR512RegClass;
     break;
   }
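To illustrate the 256-bit caveat from the commit message, a second hypothetical snippet (again not from the patch's tests): a 32-byte-aligned nontemporal ymm load, which only selects VMOVNTDQAYrm when AVX2 is available and otherwise falls back to a regular aligned ymm load.

; Hypothetical example: a 32-byte-aligned nontemporal ymm load.
; On AVX2 targets this selects VMOVNTDQAYrm; on AVX1-only targets a
; normal VMOVAPSYrm is still used, since 256-bit VMOVNTDQA requires AVX2.
define <8 x float> @load_nt_v8f32(<8 x float>* %ptr) {
  %v = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !0
  ret <8 x float> %v
}
!0 = !{i32 1}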
