Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td
@@ -2030,6 +2030,18 @@
                      [IntrReadWriteArgMem]>;
 }
 
+// Store ops using non-temporal hint
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx512_storent_q_512 :
+        GCCBuiltin<"__builtin_ia32_movntdq512">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_storent_pd_512 :
+        GCCBuiltin<"__builtin_ia32_movntpd512">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty], [IntrReadWriteArgMem]>;
+  def int_x86_avx512_storent_ps_512 :
+        GCCBuiltin<"__builtin_ia32_movntps512">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty], [IntrReadWriteArgMem]>;
+}
 //===----------------------------------------------------------------------===//
 // AVX2
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -4229,10 +4229,11 @@
     break;
   }
   case STOREA:
+  case STOREANT:
   case STOREU: {
     Info.ptrVal = I.getArgOperand(0);
     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
-    Info.align = (IntrData->Type == STOREA ? Info.memVT.getSizeInBits()/8 : 1);
+    Info.align = (IntrData->Type == STOREU ? 1 : Info.memVT.getSizeInBits()/8);
     Info.writeMem = true;
     break;
   }
@@ -17739,6 +17740,20 @@
     return DAG.getMaskedStore(Chain, dl, Data, Addr, VMask, VT,
                               MemIntr->getMemOperand(), false);
   }
+  case STOREANT: {
+    // Store (MOVNTPD, MOVNTPS, MOVNTDQ) using non-temporal hint intrinsic implementation.
+    SDValue Data = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+    MachineMemOperand *MMO = MemIntr->getMemOperand();
+
+    MMO->setFlags(MachineMemOperand::MONonTemporal);
+
+    return DAG.getStore(Chain, dl, Data, Addr, MMO);
+  }
   }
 }
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -3194,50 +3194,31 @@
 }
 }
 
-multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
-                        ValueType OpVT, RegisterClass RC, X86MemOperand memop,
-                        Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
+multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                        PatFrag st_frag = alignednontemporalstore,
+                        InstrItinClass itin = IIC_SSE_MOVNT> {
   let SchedRW = [WriteStore], mayStore = 1, AddedComplexity = 400 in
-  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
-                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                    [(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
+  def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(st_frag (_.VT _.RC:$src), addr:$dst)],
+                    _.ExeDomain, itin>, EVEX, EVEX_CD8<_.EltSize, CD8VF>;
 }
 
-multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
-                           string elty, string elsz, string vsz512,
-                           string vsz256, string vsz128, Domain d,
-                           Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
-  let Predicates = [prd] in
-    defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
-                          !cast<ValueType>("v"##vsz512##elty##elsz), VR512,
-                          !cast<X86MemOperand>(elty##"512mem"), d, itin>,
-                          EVEX_V512;
-
-  let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
-                             !cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
-                             !cast<X86MemOperand>(elty##"256mem"), d, itin>,
-                             EVEX_V256;
+multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr,
+                           AVX512VLVectorVTInfo VTInfo> {
+  let Predicates = [HasAVX512] in
+    defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
avx512_movnt("v"##vsz128##elty##elsz), VR128X, - !cast(elty##"128mem"), d, itin>, - EVEX_V128; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_movnt, EVEX_V256; + defm Z128 : avx512_movnt, EVEX_V128; } } -defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, - "i", "64", "8", "4", "2", SSEPackedInt, - HasAVX512>, PD, EVEX_CD8<64, CD8VF>; - -defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, - "f", "64", "8", "4", "2", SSEPackedDouble, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, - "f", "32", "16", "8", "4", SSEPackedSingle, - HasAVX512>, PS, EVEX_CD8<32, CD8VF>; +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info>, PD; +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info>, PD, VEX_W; +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info>, PS; //===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -29,7 +29,7 @@ INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, - EXPAND_FROM_MEM, LOADA, LOADU, STOREA, STOREU, BLEND, INSERT_SUBVEC, + EXPAND_FROM_MEM, LOADA, LOADU, STOREA, STOREU, STOREANT, BLEND, INSERT_SUBVEC, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; @@ -260,7 +260,9 @@ X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), - + X86_INTRINSIC_DATA(avx512_storent_pd_512, STOREANT, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_storent_ps_512, STOREANT, ISD::DELETED_NODE, 0), + X86_INTRINSIC_DATA(avx512_storent_q_512, STOREANT, ISD::DELETED_NODE, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -7176,3 +7176,35 @@ ret <2 x double> %res4 } +declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>) + +define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) { +; CHECK-LABEL: test_storent_q_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>) + +define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) { +; CHECK-LABEL: test_storent_pd_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovntpd %zmm0, (%rdi) +; CHECK-NEXT: retq + call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>) + +define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) { +; CHECK-LABEL: test_storent_ps_512: +; CHECK: 
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
+  ret void
+}
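
For reference, a minimal cross-check that is not part of the patch (the function name and metadata node id below are illustrative only): the STOREANT lowering above re-emits the intrinsic as an ordinary 64-byte-aligned store whose MachineMemOperand carries MONonTemporal, so it should hit the same alignednontemporalstore patterns as plain IR that uses !nontemporal metadata, e.g. when compiled with llc -mattr=+avx512f:

; Hand-written equivalent of test_storent_pd_512 using !nontemporal metadata
; instead of the new intrinsic; expected to select vmovntpd %zmm0, (%rdi) as well.
define void @storent_pd_via_metadata(<8 x double> %data, <8 x double>* %ptr) {
  store <8 x double> %data, <8 x double>* %ptr, align 64, !nontemporal !0
  ret void
}

!0 = !{i32 1}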