Index: cfe/trunk/include/clang/Basic/BuiltinsX86.def
===================================================================
--- cfe/trunk/include/clang/Basic/BuiltinsX86.def
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def
@@ -313,7 +313,6 @@
 TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
 TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
@@ -327,8 +326,6 @@
 TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntpd, "vd*V2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntdq, "vV2LLi*V2LLi", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
 TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
@@ -493,9 +490,6 @@
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntdq256, "vV4LLi*V4LLi", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntpd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntps256, "vf*V8f", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
 TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
@@ -2154,10 +2148,7 @@
 TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntdq512, "vV8LLi*V8LLi","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntpd512, "vd*V8d","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntps512, "vf*V16f","","avx512f")
 TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
 TARGET_BUILTIN(__builtin_ia32_palignr128_mask, "V16cV16cV16cIiV16cUs","","avx512bw,avx512vl")
 TARGET_BUILTIN(__builtin_ia32_palignr256_mask, "V32cV32cV32cIiV32cUi","","avx512bw,avx512vl")
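Note: with these TARGET_BUILTIN entries deleted, code that called the removed builtins directly no longer compiles; the supported spellings are the _mm*_stream_* intrinsics, which the header changes below reroute through the generic nontemporal-store builtin. A minimal migration sketch (illustrative only, not part of this patch; the function name is hypothetical):

  #include <xmmintrin.h>

  /* Hypothetical migration example: code that used to call the removed
     __builtin_ia32_movntps(p, (__v4sf)v) directly should call the
     intrinsic, which now expands to the generic builtin. */
  void store_stream(float *p, __m128 v) {
    _mm_stream_ps(p, v);
    /* equivalently: __builtin_nontemporal_store((__v4sf)v, (__v4sf *)p); */
  }
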
Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -243,14 +243,14 @@
     // little-Endian, the high bits in big-Endian. Therefore, on big-Endian
     // we need to shift the high bits down to the low before truncating.
     Width >>= 1;
-    if (CGF.getTarget().isBigEndian()) {
-      Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
-      V = CGF.Builder.CreateLShr(V, ShiftCst);
-    }
-    // We are truncating value in order to extract the higher-order
-    // double, which we will be using to extract the sign from.
-    IntTy = llvm::IntegerType::get(C, Width);
-    V = CGF.Builder.CreateTrunc(V, IntTy);
+    if (CGF.getTarget().isBigEndian()) {
+      Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
+      V = CGF.Builder.CreateLShr(V, ShiftCst);
+    }
+    // We are truncating value in order to extract the higher-order
+    // double, which we will be using to extract the sign from.
+    IntTy = llvm::IntegerType::get(C, Width);
+    V = CGF.Builder.CreateTrunc(V, IntTy);
   }
   Value *Zero = llvm::Constant::getNullValue(IntTy);
   return CGF.Builder.CreateICmpSLT(V, Zero);
@@ -1815,13 +1815,13 @@
   case Builtin::BI__builtin_smull_overflow:
   case Builtin::BI__builtin_smulll_overflow:
     IntrinsicId = llvm::Intrinsic::smul_with_overflow;
-    break;
-  }
-
-
-  llvm::Value *Carry;
-  llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
-  Builder.CreateStore(Sum, SumOutPtr);
+    break;
+  }
+
+
+  llvm::Value *Carry;
+  llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
+  Builder.CreateStore(Sum, SumOutPtr);
   return RValue::get(Carry);
 }
 
@@ -3569,13 +3569,13 @@
                                  llvm::Type *ResTy, unsigned IntID,
                                  const char *Name) {
   SmallVector<Value *, 2> TblOps;
-  if (ExtOp)
-    TblOps.push_back(ExtOp);
-
-  // Build a vector containing sequential number like (0, 1, 2, ..., 15)
-  SmallVector<uint32_t, 16> Indices;
-  llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
-  for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
+  if (ExtOp)
+    TblOps.push_back(ExtOp);
+
+  // Build a vector containing sequential number like (0, 1, 2, ..., 15)
+  SmallVector<uint32_t, 16> Indices;
+  llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
+  for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
     Indices.push_back(2*i);
     Indices.push_back(2*i+1);
   }
@@ -3596,13 +3596,13 @@
                                    ZeroTbl, Indices, Name));
   }
 
-  Function *TblF;
-  TblOps.push_back(IndexOp);
-  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
-
-  return CGF.EmitNeonCall(TblF, TblOps, Name);
-}
-
+  Function *TblF;
+  TblOps.push_back(IndexOp);
+  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
+
+  return CGF.EmitNeonCall(TblF, TblOps, Name);
+}
+
 Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
   unsigned Value;
   switch (BuiltinID) {
@@ -4102,13 +4102,13 @@
                                        "vsha1h");
 
   // The ARM _MoveToCoprocessor builtins put the input register value as
-  // the first argument, but the LLVM intrinsic expects it as the third one.
-  case ARM::BI_MoveToCoprocessor:
-  case ARM::BI_MoveToCoprocessor2: {
-    Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
-                                   Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
-    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
-                                  Ops[3], Ops[4], Ops[5]});
+  // the first argument, but the LLVM intrinsic expects it as the third one.
+  case ARM::BI_MoveToCoprocessor:
+  case ARM::BI_MoveToCoprocessor2: {
+    Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
+                                   Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
+    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
+                                  Ops[3], Ops[4], Ops[5]});
   }
   }
 
@@ -6701,39 +6701,26 @@
     if (Ops.size() == 3)
       return Align;
 
-    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
-  }
-
-  case X86::BI__builtin_ia32_movntps:
-  case X86::BI__builtin_ia32_movntps256:
-  case X86::BI__builtin_ia32_movntpd:
-  case X86::BI__builtin_ia32_movntpd256:
-  case X86::BI__builtin_ia32_movntdq:
-  case X86::BI__builtin_ia32_movntdq256:
-  case X86::BI__builtin_ia32_movnti:
-  case X86::BI__builtin_ia32_movnti64: {
-    llvm::MDNode *Node = llvm::MDNode::get(
+    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
+  }
+
+  case X86::BI__builtin_ia32_movnti:
+  case X86::BI__builtin_ia32_movnti64: {
+    llvm::MDNode *Node = llvm::MDNode::get(
         getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
 
     // Convert the type of the pointer to a pointer to the stored type.
     Value *BC = Builder.CreateBitCast(Ops[0],
                                 llvm::PointerType::getUnqual(Ops[1]->getType()),
                                       "cast");
-    StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
-    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
-
-    // If the operand is an integer, we can't assume alignment. Otherwise,
-    // assume natural alignment.
-    QualType ArgTy = E->getArg(1)->getType();
-    unsigned Align;
-    if (ArgTy->isIntegerType())
-      Align = 1;
-    else
-      Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
-    SI->setAlignment(Align);
-    return SI;
-  }
-  case X86::BI__builtin_ia32_selectb_128:
+    StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
+    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);
+
+    // No alignment for scalar intrinsic store.
+    SI->setAlignment(1);
+    return SI;
+  }
+  case X86::BI__builtin_ia32_selectb_128:
   case X86::BI__builtin_ia32_selectb_256:
   case X86::BI__builtin_ia32_selectb_512:
   case X86::BI__builtin_ia32_selectw_128:
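With the vector cases deleted, only the scalar movnti/movnti64 builtins still take this hand-built path: an ordinary store tagged with !nontemporal metadata and alignment 1, since a bare integer pointer carries no alignment guarantee. A usage sketch (illustrative; the function name is hypothetical):

  #include <emmintrin.h>

  /* _mm_stream_si32 still lowers through __builtin_ia32_movnti, so per
     the code above the emitted IR is roughly:
       store i32 %v, i32* %p, align 1, !nontemporal */
  void publish_value(int *p, int v) {
    _mm_stream_si32(p, v);
  }
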
Index: cfe/trunk/lib/Headers/avx512fintrin.h
===================================================================
--- cfe/trunk/lib/Headers/avx512fintrin.h
+++ cfe/trunk/lib/Headers/avx512fintrin.h
@@ -8866,7 +8866,7 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_si512 (__m512i * __P, __m512i __A)
 {
-  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -8878,13 +8878,13 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_pd (double *__P, __m512d __A)
 {
-  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_ps (float *__P, __m512 __A)
 {
-  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
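All three 512-bit stream intrinsics now funnel into __builtin_nontemporal_store, so the store's IR alignment comes from the vector type: 64 bytes, as the updated tests further below check. A usage sketch (illustrative names; assumes AVX-512F and the 64-byte-aligned destination the stream intrinsics require):

  #include <immintrin.h>
  #include <stddef.h>

  /* Fill a 64-byte-aligned buffer with 512-bit nontemporal stores; each
     call becomes a generic !nontemporal store, align 64, which the X86
     backend is expected to select as vmovntps. n must be a multiple of 16. */
  void stream_fill(float *dst, size_t n, float x) {
    __m512 v = _mm512_set1_ps(x);
    for (size_t i = 0; i != n; i += 16)
      _mm512_stream_ps(dst + i, v);
  }
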
Index: cfe/trunk/lib/Headers/avxintrin.h
===================================================================
--- cfe/trunk/lib/Headers/avxintrin.h
+++ cfe/trunk/lib/Headers/avxintrin.h
@@ -2496,19 +2496,19 @@
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
-  __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
-  __builtin_ia32_movntpd256(__a, (__v4df)__b);
+  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
 }
 
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
-  __builtin_ia32_movntps256(__p, (__v8sf)__a);
+  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
 }
 
 /* Create vectors */
Index: cfe/trunk/lib/Headers/emmintrin.h
===================================================================
--- cfe/trunk/lib/Headers/emmintrin.h
+++ cfe/trunk/lib/Headers/emmintrin.h
@@ -2210,13 +2210,13 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
-  __builtin_ia32_movntpd(__p, (__v2df)__a);
+  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
-  __builtin_ia32_movntdq(__p, (__v2di)__a);
+  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
Index: cfe/trunk/lib/Headers/xmmintrin.h
===================================================================
--- cfe/trunk/lib/Headers/xmmintrin.h
+++ cfe/trunk/lib/Headers/xmmintrin.h
@@ -2080,7 +2080,7 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ps(float *__p, __m128 __a)
 {
-  __builtin_ia32_movntps(__p, (__v4sf)__a);
+  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
 }
 
 /// \brief Forces strong memory ordering (serialization) between store
Index: cfe/trunk/test/CodeGen/avx512f-builtins.c
===================================================================
--- cfe/trunk/test/CodeGen/avx512f-builtins.c
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c
@@ -5800,7 +5800,7 @@
 
 void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
   // CHECK-LABEL: @test_mm512_stream_si512
-  // CHECK: @llvm.x86.avx512.storent.q.512
+  // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
   _mm512_stream_si512(__P, __A);
 }
 
@@ -5812,13 +5812,13 @@
 
 void test_mm512_stream_pd(double *__P, __m512d __A) {
   // CHECK-LABEL: @test_mm512_stream_pd
-  // CHECK: @llvm.x86.avx512.storent.pd.512
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
   return _mm512_stream_pd(__P, __A);
 }
 
 void test_mm512_stream_ps(float *__P, __m512 __A) {
   // CHECK-LABEL: @test_mm512_stream_ps
-  // CHECK: @llvm.x86.avx512.storent.ps.512
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
   _mm512_stream_ps(__P, __A);
 }
Index: cfe/trunk/test/CodeGen/builtins-x86.c
===================================================================
--- cfe/trunk/test/CodeGen/builtins-x86.c
+++ cfe/trunk/test/CodeGen/builtins-x86.c
@@ -300,7 +300,6 @@
   (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
   tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
 
   (void) __builtin_ia32_sfence();
@@ -318,8 +317,6 @@
 #ifdef USE_64
   (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
 #endif
-  (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
-  (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
   tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
@@ -446,9 +443,6 @@
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
   tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
-  __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
-  __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
   tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
   tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
   tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
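
One operational note when exercising any of these intrinsics: nontemporal stores are weakly ordered, so a producer should fence before publishing the data to another thread. An illustrative consumer (hypothetical name, not part of this patch; assumes 32-byte-aligned pointers and n a multiple of 8):

  #include <immintrin.h>
  #include <stddef.h>

  /* Copy through 256-bit streaming stores, bypassing the cache, then
     fence so the weakly ordered stores are globally visible before any
     subsequent store. */
  void stream_copy(float *dst, const float *src, size_t n) {
    for (size_t i = 0; i != n; i += 8) {
      __m256 v = _mm256_load_ps(src + i);
      _mm256_stream_ps(dst + i, v);
    }
    _mm_sfence();
  }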