diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12409,8 +12409,8 @@
 static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
                                  Align Alignment) {
   // Cast the pointer to right type.
-  Value *Ptr = CGF.Builder.CreateBitCast(Ops[0],
-                               llvm::PointerType::getUnqual(Ops[1]->getType()));
+  Value *Ptr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+      Ops[0], llvm::PointerType::getUnqual(Ops[1]->getType()));
 
   Value *MaskVec = getMaskVecValue(
       CGF, Ops[2],
@@ -12423,8 +12423,8 @@
                                 Align Alignment) {
   // Cast the pointer to right type.
   llvm::Type *Ty = Ops[1]->getType();
-  Value *Ptr =
-      CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(Ty));
+  Value *Ptr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+      Ops[0], llvm::PointerType::getUnqual(Ty));
 
   Value *MaskVec = getMaskVecValue(
       CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
diff --git a/clang/test/CodeGen/X86/x86-builtins-intrinsic.cl b/clang/test/CodeGen/X86/x86-builtins-intrinsic.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/X86/x86-builtins-intrinsic.cl
@@ -0,0 +1,73 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -no-opaque-pointers %s -triple x86_64-pc-linux \
+// RUN:   -emit-llvm -O0 -x cl -cl-std=CL2.0 -ffake-address-space-map \
+// RUN:   -target-cpu skx -o - | FileCheck %s
+
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+// CHECK-LABEL: @load(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__P_ADDR:%.*]] = alloca i8 addrspace(4)*, align 8
+// CHECK-NEXT:    [[DD:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT:    store i16 [[__U:%.*]], i16* [[__U_ADDR]], align 2
+// CHECK-NEXT:    store i8 addrspace(4)* [[__P:%.*]], i8 addrspace(4)** [[__P_ADDR]], align 8
+// CHECK-NEXT:    store <16 x i32> zeroinitializer, <16 x i32>* [[DD]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)** [[__P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)*
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[DD]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast i32 addrspace(4)* [[TMP1]] to <16 x i32>*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 1, <16 x i1> [[TMP5]], <16 x i32> [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void load(unsigned short __U, void const *__P)
+{
+  __v16si dd = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  __builtin_ia32_loaddqusi512_mask(__P, dd, __U);
+}
+
+// CHECK-LABEL: @store(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__P_ADDR:%.*]] = alloca i8 addrspace(4)*, align 8
+// CHECK-NEXT:    [[AA:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT:    store i16 [[__U:%.*]], i16* [[__U_ADDR]], align 2
+// CHECK-NEXT:    store i8 addrspace(4)* [[__P:%.*]], i8 addrspace(4)** [[__P_ADDR]], align 8
+// CHECK-NEXT:    store <16 x i32> zeroinitializer, <16 x i32>* [[AA]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)** [[__P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)*
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* [[AA]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast i32 addrspace(4)* [[TMP1]] to <16 x i32>*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP2]], <16 x i32>* [[TMP4]], i32 1, <16 x i1> [[TMP5]])
+// CHECK-NEXT:    ret void
+//
+void store(unsigned short __U, void const *__P)
+{
+  __v16si aa = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  __builtin_ia32_storedqusi512_mask((int*)__P, aa, __U);
+}
+
+// CHECK-LABEL: @foo(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[PLOADSTORE_ADDR:%.*]] = alloca i8 addrspace(4)*, align 8
+// CHECK-NEXT:    [[MASK_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store i8 addrspace(4)* [[PLOADSTORE:%.*]], i8 addrspace(4)** [[PLOADSTORE_ADDR]], align 8
+// CHECK-NEXT:    store i16 [[MASK:%.*]], i16* [[MASK_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* [[MASK_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)** [[PLOADSTORE_ADDR]], align 8
+// CHECK-NEXT:    call void @load(i16 noundef zeroext [[TMP0]], i8 addrspace(4)* noundef [[TMP1]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[MASK_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)** [[PLOADSTORE_ADDR]], align 8
+// CHECK-NEXT:    call void @store(i16 noundef zeroext [[TMP2]], i8 addrspace(4)* noundef [[TMP3]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+void foo(void const *pLoadStore, unsigned short mask)
+{
+  load(mask, pLoadStore);
+  store(mask, pLoadStore);
+}