Index: lib/Target/Hexagon/HexagonISelDAGToDAG.cpp =================================================================== --- lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -241,22 +241,31 @@ case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: - if (isAlignedMemNode(LD)) - Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai; - else + if (isAlignedMemNode(LD)) { + if (LD->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi : Hexagon::V6_vL32b_nt_ai; + else + Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai; + } else { Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai; + } break; // 128B case MVT::v128i8: case MVT::v64i16: case MVT::v32i32: case MVT::v16i64: - if (isAlignedMemNode(LD)) - Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B - : Hexagon::V6_vL32b_ai_128B; - else + if (isAlignedMemNode(LD)) { + if (LD->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi_128B + : Hexagon::V6_vL32b_nt_ai_128B; + else + Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B + : Hexagon::V6_vL32b_ai_128B; + } else { Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B : Hexagon::V6_vL32Ub_ai_128B; + } break; default: llvm_unreachable("Unexpected memory type in indexed load"); @@ -529,22 +538,31 @@ case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: - if (isAlignedMemNode(ST)) - Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai; - else + if (isAlignedMemNode(ST)) { + if (ST->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi : Hexagon::V6_vS32b_nt_ai; + else + Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai; + } else { Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai; + } break; // 128B case MVT::v128i8: case MVT::v64i16: case MVT::v32i32: case MVT::v16i64: - if (isAlignedMemNode(ST)) - Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B - : Hexagon::V6_vS32b_ai_128B; - else + if (isAlignedMemNode(ST)) { + if (ST->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi_128B + : Hexagon::V6_vS32b_nt_ai_128B; + else + Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B + : Hexagon::V6_vS32b_ai_128B; + } else { Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B : Hexagon::V6_vS32Ub_ai_128B; + } break; default: llvm_unreachable("Unexpected memory type in indexed store"); Index: lib/Target/Hexagon/HexagonInstrInfo.cpp =================================================================== --- lib/Target/Hexagon/HexagonInstrInfo.cpp +++ lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -250,15 +250,19 @@ case Hexagon::L2_loadri_io: case Hexagon::L2_loadrd_io: case Hexagon::V6_vL32b_ai: + case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vL32b_ai_128B: + case Hexagon::V6_vL32b_nt_ai_128B: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::LDriw_pred: case Hexagon::LDriw_mod: case Hexagon::PS_vloadrq_ai: case Hexagon::PS_vloadrw_ai: + case Hexagon::PS_vloadrw_nt_ai: case Hexagon::PS_vloadrq_ai_128B: - case Hexagon::PS_vloadrw_ai_128B: { + case Hexagon::PS_vloadrw_ai_128B: + case Hexagon::PS_vloadrw_nt_ai_128B: { const MachineOperand OpFI = MI.getOperand(1); if (!OpFI.isFI()) return 0; @@ -2445,20 +2449,28 @@ switch (Opcode) { case Hexagon::PS_vstorerq_ai: case Hexagon::PS_vstorerw_ai: + case Hexagon::PS_vstorerw_nt_ai: case Hexagon::PS_vloadrq_ai: case Hexagon::PS_vloadrw_ai: + case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vL32b_nt_ai: + case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: return isShiftedInt<4,6>(Offset); case Hexagon::PS_vstorerq_ai_128B: case Hexagon::PS_vstorerw_ai_128B: + case Hexagon::PS_vstorerw_nt_ai_128B: case Hexagon::PS_vloadrq_ai_128B: case Hexagon::PS_vloadrw_ai_128B: + case Hexagon::PS_vloadrw_nt_ai_128B: case Hexagon::V6_vL32b_ai_128B: case Hexagon::V6_vS32b_ai_128B: + case Hexagon::V6_vL32b_nt_ai_128B: + case Hexagon::V6_vS32b_nt_ai_128B: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::V6_vS32Ub_ai_128B: return isShiftedInt<4,7>(Offset); @@ -3170,11 +3182,19 @@ return Hexagon::V6_vL32b_cur_pi; case Hexagon::V6_vL32b_ai: return Hexagon::V6_vL32b_cur_ai; + case Hexagon::V6_vL32b_nt_pi: + return Hexagon::V6_vL32b_nt_cur_pi; + case Hexagon::V6_vL32b_nt_ai: + return Hexagon::V6_vL32b_nt_cur_ai; //128B case Hexagon::V6_vL32b_pi_128B: return Hexagon::V6_vL32b_cur_pi_128B; case Hexagon::V6_vL32b_ai_128B: return Hexagon::V6_vL32b_cur_ai_128B; + case Hexagon::V6_vL32b_nt_pi_128B: + return Hexagon::V6_vL32b_nt_cur_pi_128B; + case Hexagon::V6_vL32b_nt_ai_128B: + return Hexagon::V6_vL32b_nt_cur_ai_128B; } return 0; } @@ -3187,11 +3207,19 @@ return Hexagon::V6_vL32b_pi; case Hexagon::V6_vL32b_cur_ai: return Hexagon::V6_vL32b_ai; + case Hexagon::V6_vL32b_nt_cur_pi: + return Hexagon::V6_vL32b_nt_pi; + case Hexagon::V6_vL32b_nt_cur_ai: + return Hexagon::V6_vL32b_nt_ai; //128B case Hexagon::V6_vL32b_cur_pi_128B: return Hexagon::V6_vL32b_pi_128B; case Hexagon::V6_vL32b_cur_ai_128B: return Hexagon::V6_vL32b_ai_128B; + case Hexagon::V6_vL32b_nt_cur_pi_128B: + return Hexagon::V6_vL32b_nt_pi_128B; + case Hexagon::V6_vL32b_nt_cur_ai_128B: + return Hexagon::V6_vL32b_nt_ai_128B; } return 0; } Index: lib/Target/Hexagon/HexagonPatterns.td =================================================================== --- lib/Target/Hexagon/HexagonPatterns.td +++ lib/Target/Hexagon/HexagonPatterns.td @@ -2770,6 +2770,9 @@ multiclass vS32b_ai_pats { // Aligned stores + def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1), IntRegs:$addr), + (V6_vS32b_nt_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr), (V6_vS32b_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>, Requires<[UseHVXSgl]>; @@ -2778,6 +2781,9 @@ Requires<[UseHVXSgl]>; // 128B Aligned stores + def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr), + (V6_vS32b_nt_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr), (V6_vS32b_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>, Requires<[UseHVXDbl]>; @@ -2787,6 +2793,11 @@ // Fold Add R+OFF into vector store. let AddedComplexity = 10 in { + def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1), + (add IntRegs:$src2, Iss4_6:$offset)), + (V6_vS32b_nt_ai IntRegs:$src2, Iss4_6:$offset, + (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VectorRegs:$src1), (add IntRegs:$src2, Iss4_6:$offset)), (V6_vS32b_ai IntRegs:$src2, Iss4_6:$offset, @@ -2799,6 +2810,11 @@ Requires<[UseHVXSgl]>; // Fold Add R+OFF into vector store 128B. + def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1), + (add IntRegs:$src2, Iss4_7:$offset)), + (V6_vS32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset, + (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), (add IntRegs:$src2, Iss4_7:$offset)), (V6_vS32b_ai_128B IntRegs:$src2, Iss4_7:$offset, @@ -2820,6 +2836,9 @@ multiclass vL32b_ai_pats { // Aligned loads + def : Pat < (VTSgl (alignednontemporalload IntRegs:$addr)), + (V6_vL32b_nt_ai IntRegs:$addr, 0) >, + Requires<[UseHVXSgl]>; def : Pat < (VTSgl (alignedload IntRegs:$addr)), (V6_vL32b_ai IntRegs:$addr, 0) >, Requires<[UseHVXSgl]>; @@ -2828,6 +2847,9 @@ Requires<[UseHVXSgl]>; // 128B Load + def : Pat < (VTDbl (alignednontemporalload IntRegs:$addr)), + (V6_vL32b_nt_ai_128B IntRegs:$addr, 0) >, + Requires<[UseHVXDbl]>; def : Pat < (VTDbl (alignedload IntRegs:$addr)), (V6_vL32b_ai_128B IntRegs:$addr, 0) >, Requires<[UseHVXDbl]>; @@ -2837,6 +2859,9 @@ // Fold Add R+OFF into vector load. let AddedComplexity = 10 in { + def : Pat<(VTDbl (alignednontemporalload (add IntRegs:$src2, Iss4_7:$offset))), + (V6_vL32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset)>, + Requires<[UseHVXDbl]>; def : Pat<(VTDbl (alignedload (add IntRegs:$src2, Iss4_7:$offset))), (V6_vL32b_ai_128B IntRegs:$src2, Iss4_7:$offset)>, Requires<[UseHVXDbl]>; @@ -2844,6 +2869,9 @@ (V6_vL32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset)>, Requires<[UseHVXDbl]>; + def : Pat<(VTSgl (alignednontemporalload (add IntRegs:$src2, Iss4_6:$offset))), + (V6_vL32b_nt_ai IntRegs:$src2, Iss4_6:$offset)>, + Requires<[UseHVXSgl]>; def : Pat<(VTSgl (alignedload (add IntRegs:$src2, Iss4_6:$offset))), (V6_vL32b_ai IntRegs:$src2, Iss4_6:$offset)>, Requires<[UseHVXSgl]>; @@ -2859,6 +2887,9 @@ defm : vL32b_ai_pats ; multiclass STrivv_pats { + def : Pat<(alignednontemporalstore (VTSgl VecDblRegs:$src1), IntRegs:$addr), + (PS_vstorerw_nt_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr), (PS_vstorerw_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, Requires<[UseHVXSgl]>; @@ -2866,6 +2897,10 @@ (PS_vstorerwu_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, Requires<[UseHVXSgl]>; + def : Pat<(alignednontemporalstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), + (PS_vstorerw_nt_ai_128B IntRegs:$addr, 0, + (VTDbl VecDblRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), (PS_vstorerw_ai_128B IntRegs:$addr, 0, (VTDbl VecDblRegs128B:$src1))>, @@ -2882,6 +2917,9 @@ defm : STrivv_pats ; multiclass LDrivv_pats { + def : Pat<(VTSgl (alignednontemporalload I32:$addr)), + (PS_vloadrw_nt_ai I32:$addr, 0)>, + Requires<[UseHVXSgl]>; def : Pat<(VTSgl (alignedload I32:$addr)), (PS_vloadrw_ai I32:$addr, 0)>, Requires<[UseHVXSgl]>; @@ -2889,6 +2927,9 @@ (PS_vloadrwu_ai I32:$addr, 0)>, Requires<[UseHVXSgl]>; + def : Pat<(VTDbl (alignednontemporalload I32:$addr)), + (PS_vloadrw_nt_ai_128B I32:$addr, 0)>, + Requires<[UseHVXDbl]>; def : Pat<(VTDbl (alignedload I32:$addr)), (PS_vloadrw_ai_128B I32:$addr, 0)>, Requires<[UseHVXDbl]>; Index: lib/Target/Hexagon/HexagonPseudo.td =================================================================== --- lib/Target/Hexagon/HexagonPseudo.td +++ lib/Target/Hexagon/HexagonPseudo.td @@ -407,6 +407,11 @@ def PS_vstorerw_ai_128B: STrivv_template, Requires<[HasV60T,UseHVXDbl]>; +def PS_vstorerw_nt_ai: STrivv_template, + Requires<[HasV60T,UseHVXSgl]>; +def PS_vstorerw_nt_ai_128B: STrivv_template, + Requires<[HasV60T,UseHVXDbl]>; + def PS_vstorerwu_ai: STrivv_template, Requires<[HasV60T,UseHVXSgl]>; def PS_vstorerwu_ai_128B: STrivv_template, @@ -433,6 +438,11 @@ def PS_vloadrw_ai_128B: LDrivv_template, Requires<[HasV60T,UseHVXDbl]>; +def PS_vloadrw_nt_ai: LDrivv_template, + Requires<[HasV60T,UseHVXSgl]>; +def PS_vloadrw_nt_ai_128B: LDrivv_template, + Requires<[HasV60T,UseHVXDbl]>; + def PS_vloadrwu_ai: LDrivv_template, Requires<[HasV60T,UseHVXSgl]>; def PS_vloadrwu_ai_128B: LDrivv_template, Index: test/CodeGen/Hexagon/hvx-nontemporal.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/hvx-nontemporal.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s +target triple = "hexagon" + +; Function Attrs: norecurse nounwind +define void @test(<32 x i32>* nocapture readonly %x, <32 x i32>* nocapture readnone %y, <32 x i32>* nocapture %a, <32 x i32>* nocapture %b) #0 { +entry: +; CHECK: v0 = vmem(r0+#7):nt + %add.ptr = getelementptr inbounds <32 x i32>, <32 x i32>* %x, i32 7 + %0 = load <32 x i32>, <32 x i32>* %add.ptr, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: v1.cur = vmem(r2+#0):nt + %1 = load <32 x i32>, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: vmem(r3+#3):nt = v1 + %add.ptr2 = getelementptr inbounds <32 x i32>, <32 x i32>* %b, i32 3 + store <32 x i32> %1, <32 x i32>* %add.ptr2, align 128, !tbaa !1, !nontemporal !4 + +; CHECK: vmem(r2+#0):nt = v0 + store <32 x i32> %0, <32 x i32>* %a, align 128, !tbaa !1, !nontemporal !4 + ret void +} + +attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" } + +!1 = !{!2, !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} +!4 = !{i32 1}