Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -785,12 +785,6 @@
                          [IntrNoMem, Commutative]>;
 }
 
-// Cacheability support ops
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
-}
-
 // Test instruction with bitwise comparison.
 let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">,
@@ -2346,8 +2340,6 @@
   def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
                          llvm_i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -6345,10 +6337,6 @@
               GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
               Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                          llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_movntdqa :
-              GCCBuiltin<"__builtin_ia32_movntdqa512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
 }
 
 //===----------------------------------------------------------------------===//
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -202,6 +202,9 @@
       Name.startswith("sse4a.movnt.") || // Added in 3.9
       Name.startswith("avx.movnt.") || // Added in 3.2
       Name.startswith("avx512.storent.") || // Added in 3.9
+      Name == "sse41.movntdqa" || // Added in 5.0
+      Name == "avx2.movntdqa" || // Added in 5.0
+      Name == "avx512.movntdqa" || // Added in 5.0
       Name == "sse2.storel.dq" || // Added in 3.9
       Name.startswith("sse.storeu.") || // Added in 3.9
       Name.startswith("sse2.storeu.") || // Added in 3.9
@@ -1870,6 +1873,24 @@
                                { CI->getArgOperand(0), CI->getArgOperand(1) });
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsX86 && Name.endswith(".movntdqa")) {
+      // The movntdqa intrinsics are removed by this change; replace the call
+      // with an aligned vector load tagged with !nontemporal metadata (i32 1).
+      Module *M = F->getParent();
+      SmallVector<Metadata *, 1> Elts;
+      Elts.push_back(
+          ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+      MDNode *Node = MDNode::get(C, Elts);
+
+      Value *Ptr = CI->getArgOperand(0);
+      VectorType *VTy = cast<VectorType>(CI->getType());
+
+      // Convert the type of the pointer to a pointer to the loaded type.
+      Value *BC =
+          Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
+      LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
+      LI->setMetadata(M->getMDKindID("nontemporal"), Node);
+      Rep = LI;
     } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
       Value *Arg = CI->getArgOperand(0);
       Value *Neg = Builder.CreateNeg(Arg, "neg");
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3633,23 +3633,20 @@
 let SchedRW = [WriteLoad] in {
   def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
                         (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
-                        [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
-                        SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+                        [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
                         EVEX_CD8<64, CD8VF>;
 
   let Predicates = [HasVLX] in {
     def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
                              (ins i256mem:$src),
                              "vmovntdqa\t{$src, $dst|$dst, $src}",
-                             [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
-                             SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+                             [], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
                              EVEX_CD8<64, CD8VF>;
 
     def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
                              (ins i128mem:$src),
                              "vmovntdqa\t{$src, $dst|$dst, $src}",
-                             [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
-                             SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+                             [], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
                              EVEX_CD8<64, CD8VF>;
   }
 }
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -7095,17 +7095,14 @@
 let SchedRW = [WriteLoad] in {
 let Predicates = [HasAVX, NoVLX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                        "vmovntdqa\t{$src, $dst|$dst, $src}",
-                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX] in
 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
-                         "vmovntdqa\t{$src, $dst|$dst, $src}",
-                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_L, VEX_WIG;
 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                       "movntdqa\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+                       "movntdqa\t{$src, $dst|$dst, $src}", []>;
 } // SchedRW
 
 let Predicates = [HasAVX2, NoVLX] in {
Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -34,6 +34,18 @@
 declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
 
 
+define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
+; CHECK-LABEL: test_x86_avx2_movntdqa:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: vmovntdqa (%eax), %ymm0
+; CHECK-NEXT: retl
+  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
+
+
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_x86_avx2_mpsadbw:
 ; CHECK: ## BB#0:
@@ -370,7 +382,7 @@
 ; CHECK-LABEL: test_x86_avx_storeu_dq_256:
 ; CHECK: ## BB#0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0
+; CHECK-NEXT: vpaddb LCPI34_0, %ymm0, %ymm0
 ; CHECK-NEXT: vmovdqu %ymm0, (%eax)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retl
Index: test/CodeGen/X86/avx2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -836,24 +836,6 @@
 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 
-define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
-; AVX2-LABEL: test_x86_avx2_movntdqa:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vmovntdqa (%eax), %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2a,0x00]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_movntdqa:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2a,0x00]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
-
-
 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_x86_avx2_mpsadbw:
 ; CHECK: ## BB#0:
@@ -1358,18 +1340,18 @@
 ; AVX2: ## BB#0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
+; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
 ; AVX2-NEXT: retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI91_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT: vmovdqa LCPI90_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
+; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
 ; AVX512VL-NEXT: retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> )
   ret <4 x i32> %res
@@ -1395,18 +1377,18 @@
 ; AVX2: ## BB#0:
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
+; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
 ; AVX2-NEXT: retl ## encoding: [0xc3]
 ;
 ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI93_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT: vmovdqa LCPI92_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
+; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
 ; AVX512VL-NEXT: retl ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> )
   ret <8 x i32> %res
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -3061,3 +3061,14 @@
   %res4 = add <8 x i64> %res2, %res3
   ret <8 x i64> %res4
 }
+
+define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) {
+; CHECK-LABEL: test_x86_avx512_movntdqa:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovntdqa (%rdi), %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x i64> @llvm.x86.avx512.movntdqa(i8* %a0)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) nounwind readonly
Index: test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -59,6 +59,19 @@
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
 
+define <2 x i64> @test_x86_sse41_movntdqa(<2 x i64>* %a0) {
+; CHECK-LABEL: test_x86_sse41_movntdqa:
+; CHECK: ## BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movntdqa (%eax), %xmm0
+; CHECK-NEXT: retl
+  %arg0 = bitcast <2 x i64>* %a0 to i8*
+  %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readonly
+
+
 define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: test_x86_sse41_mpsadbw:
 ; CHECK: ## BB#0: