diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -O3 -S %s -o - | FileCheck -check-prefix=PTX %s +// RUN: %clang_cc1 -x cuda -triple nvptx64-nvidia-cuda- -fcuda-is-device \ +// RUN: -Os -S %s -o - | FileCheck -check-prefix=PTX %s +#include "Inputs/cuda.h" + +// PTX-LABEL: .func _Z12copy_genericPvPKv( +void __device__ copy_generic(void *dest, const void *src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.u8 +// PTX: st.u8 +} + +// PTX-LABEL: .entry _Z11copy_globalPvS_( +void __global__ copy_global(void *dest, void * src) { + __builtin_memcpy(dest, src, 32); +// PTX: ld.global.u8 +// PTX: st.global.u8 +} + +struct S { + int data[8]; +}; + +// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( +void __global__ copy_param_to_global(S *global, S param) { + __builtin_memcpy(global, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.global.u32 +} + +// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( +void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, + S param) { + __builtin_memcpy(local, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.local.u32 +} + +// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( +void __device__ copy_local_to_generic(S *generic, + __attribute__((address_space(5))) S *src) { + __builtin_memcpy(generic, src, sizeof(S)); +// PTX: ld.local.u32 +// PTX: st.u32 +} + +__shared__ S shared; + +// PTX-LABEL: .entry _Z20copy_param_to_shared1S( +void __global__ copy_param_to_shared( S param) { + __builtin_memcpy(&shared, ¶m, sizeof(S)); +// PTX: ld.param.u32 +// PTX: st.shared.u32 +} + +void __device__ copy_shared_to_generic(S *generic) { + __builtin_memcpy(generic, &shared, sizeof(S)); +// PTX: ld.shared.u32 +// PTX: st.u32 +} diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -386,9 +386,9 @@ // always lower memset, memcpy, and memmove intrinsics to load/store // instructions, rather // then generating calls to memset, mempcy or memmove. - MaxStoresPerMemset = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF; - MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);