diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -52,19 +52,149 @@
     endif()
   endif()
   if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    list(APPEND compile_options "-fopenmp")
-    list(APPEND compile_options "-fopenmp-cuda-mode")
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
-      list(APPEND compile_options "--offload-arch=${gpu_arch}")
-    endforeach()
     list(APPEND compile_options "-nogpulib")
-    list(APPEND compile_options "-nogpuinc")
     list(APPEND compile_options "-fvisibility=hidden")
-    list(APPEND compile_options "-foffload-lto")
   endif()
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
+# Builds the entrypoint target for the GPU: one LLVM-IR object library per entry
+# of LIBC_GPU_ARCHITECTURES ('<target_name>.<arch>'), an offload bundle of those
+# objects, and a host stub object library '<target_name>' embedding the bundle.
+# '<target_name>.__internal__' is also built when LIBC_GPU_TARGET_ARCHITECTURE
+# is flagged as an AMDGPU or NVPTX architecture.
+#
+# Usage:
+#     _build_gpu_entrypoint_objects(
+#       <target_name>
+#       SRCS <exactly one .cpp file>
+#       HDRS <list of .h files>
+#       DEPENDS <list of dependencies>
+#       COMPILE_OPTIONS <optional list of special compile options for this target>
+#       FLAGS <optional list of flags>
+#       CXX_STANDARD <C++ standard for the created targets>
+#     )
+#
+# Also reads 'include_dirs', 'fq_deps_list', 'ADD_ENTRYPOINT_OBJ_NAME' and
+# 'ENTRYPOINT_OBJ_TARGET_TYPE' from the scope it is called from.
+function(_build_gpu_entrypoint_objects fq_target_name)
+  cmake_parse_arguments(
+    "ADD_GPU_ENTRYPOINT_OBJ" "" "NAME;CXX_STANDARD"
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS" ${ARGN})
+
+  # Exactly one source is supported: its file name is reused for the host stub
+  # and its object file becomes a single '--image=file=' packager argument.
+  list(LENGTH ADD_GPU_ENTRYPOINT_OBJ_SRCS num_srcs)
+  if(NOT num_srcs EQUAL 1)
+    message(FATAL_ERROR "GPU entrypoint '${fq_target_name}' needs exactly one "
+                        "source file, got '${ADD_GPU_ENTRYPOINT_OBJ_SRCS}'")
+  endif()
+
+  # Start from empty lists so that nothing leaks in from the calling scope.
+  set(packager_images "")
+  set(gpu_target_names "")
+
+  # The packaged version will be built for every requested GPU architecture. We
+  # do this so we can support multiple accelerators on the same machine.
+  foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
+    set(gpu_target_name ${fq_target_name}.${gpu_arch})
+    set(compile_options ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS})
+    # Derive the triple from the specified architecture.
+    if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
+      set(gpu_target_triple "amdgcn-amd-amdhsa")
+      list(APPEND compile_options "-mcpu=${gpu_arch}")
+    elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
+      set(gpu_target_triple "nvptx64-nvidia-cuda")
+      list(APPEND compile_options "-march=${gpu_arch}")
+    else()
+      message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
+    endif()
+    list(APPEND compile_options "--target=${gpu_target_triple}" "-emit-llvm")
+    # Build the library for this architecture, always as LLVM-IR.
+    add_library(${gpu_target_name} EXCLUDE_FROM_ALL OBJECT
+                ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} ${ADD_GPU_ENTRYPOINT_OBJ_HDRS})
+    target_compile_options(${gpu_target_name} PRIVATE ${compile_options})
+    target_include_directories(${gpu_target_name} PRIVATE ${include_dirs})
+    target_compile_definitions(${gpu_target_name} PRIVATE LLVM_LIBC_PUBLIC_PACKAGING)
+    # The device objects hold the implementation, so apply the standard here too.
+    set_target_properties(${gpu_target_name} PROPERTIES
+                          CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD})
+    if(ADD_GPU_ENTRYPOINT_OBJ_DEPENDS) # add_dependencies() needs at least one.
+      add_dependencies(${gpu_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    endif()
+    # Append this target to a list of images to package into a single binary.
+    set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
+    list(APPEND packager_images
+         --image=file=${input_file},arch=${gpu_arch},triple=${gpu_target_triple})
+    list(APPEND gpu_target_names ${gpu_target_name})
+  endforeach()
+
+  # After building the target for the desired GPUs we must package the output
+  # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
+  # more information. The sources and headers are listed as dependencies so
+  # that the packager re-runs when they change.
+  set(packaged_target_name ${fq_target_name}.__gpu__)
+  set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.gpubin)
+  add_custom_command(OUTPUT ${packaged_output_name}
+                     COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+                             ${packager_images} -o ${packaged_output_name}
+                     DEPENDS ${gpu_target_names} ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
+                             ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
+                     COMMENT "Packaging LLVM offloading binary"
+                     VERBATIM)
+  add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
+
+  # We create an empty 'stub' file for the host to contain the embedded device
+  # code. This will be packaged into 'libcgpu.a'.
+  # TODO: In the future we will want to combine every architecture for a target
+  #       into a single bitcode file and use that. For now we simply build for
+  #       every single one and let the offloading linker handle it.
+  get_filename_component(stub_filename ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} NAME)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${stub_filename} "// Empty file.\n")
+  # An object library, as the objects end up in an archive (like libcgpu.a).
+  add_library(${fq_target_name} EXCLUDE_FROM_ALL OBJECT
+              "${CMAKE_CURRENT_BINARY_DIR}/${stub_filename}")
+  target_compile_options(${fq_target_name} BEFORE PRIVATE
+                         ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS}
+                         -DLLVM_LIBC_PUBLIC_PACKAGING
+                         -nostdlib -Xclang -fembed-offload-object=${packaged_output_name})
+  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+  add_dependencies(${fq_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS} ${packaged_target_name})
+
+  set_target_properties(${fq_target_name} PROPERTIES
+    ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+    TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+    OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+    CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}
+    DEPS "${fq_deps_list}"
+    FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}")
+
+  # We only build the internal target for a single supported architecture.
+  set(internal_target_name ${fq_target_name}.__internal__)
+  set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR})
+  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU OR LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+    add_library(${internal_target_name} EXCLUDE_FROM_ALL OBJECT
+                ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} ${ADD_GPU_ENTRYPOINT_OBJ_HDRS})
+    target_compile_options(${internal_target_name} BEFORE PRIVATE
+                           ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS}
+                           --target=${LIBC_GPU_TARGET_TRIPLE})
+    if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+      target_compile_options(${internal_target_name} PRIVATE -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE})
+    elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+      target_compile_options(${internal_target_name} PRIVATE -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+    endif()
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    if(ADD_GPU_ENTRYPOINT_OBJ_DEPENDS)
+      add_dependencies(${internal_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    endif()
+    set_target_properties(${internal_target_name} PROPERTIES
+      CXX_STANDARD ${ADD_GPU_ENTRYPOINT_OBJ_CXX_STANDARD}
+      FLAGS "${ADD_GPU_ENTRYPOINT_OBJ_FLAGS}")
+    set_target_properties(${fq_target_name} PROPERTIES
+      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>")
+  endif()
+endfunction()
+
 # Rule which is essentially a wrapper over add_library to compile a set of
 # sources to object files.
 # Usage:
@@ -127,7 +257,6 @@
   if(NOT ADD_OBJECT_CXX_STANDARD)
     set(ADD_OBJECT_CXX_STANDARD ${CMAKE_CXX_STANDARD})
   endif()
-  
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
@@ -350,53 +479,67 @@
     endif()
   endif()
 
-  add_library(
-    ${internal_target_name}
-    # TODO: We don't need an object library for internal consumption.
-    # A future change should switch this to a normal static library.
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
-  target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${internal_target_name} ${full_deps_list})
-  set_target_properties(
-    ${internal_target_name}
-    PROPERTIES
+  # GPU builds require special handling for the objects because we want to
+  # export several different targets at once, e.g. for both Nvidia and AMD.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    _build_gpu_entrypoint_objects(
+      ${fq_target_name}
+      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
+      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
+      COMPILE_OPTIONS ${common_compile_options}
+      DEPENDS ${full_deps_list}
       CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    )
+  else()
+    add_library(
+      ${internal_target_name}
+      # TODO: We don't need an object library for internal consumption.
+      # A future change should switch this to a normal static library.
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
 
-  add_library(
-    ${fq_target_name}
-    # We want an object library as the objects will eventually get packaged into
-    # an archive (like libc.a).
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
-  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${fq_target_name} ${full_deps_list})
+    add_library(
+      ${fq_target_name}
+      # We want an object library as the objects will eventually get packaged into
+      # an archive (like libc.a).
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
+    target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${fq_target_name} ${full_deps_list})
 
-  set_target_properties(
-    ${fq_target_name}
-    PROPERTIES
-      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
-      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
-      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
-      # TODO: We don't need to list internal object files if the internal
-      # target is a normal static library.
-      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPS "${fq_deps_list}"
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES
+        ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+        TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+        OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+        # TODO: We don't need to list internal object files if the internal
+        # target is a normal static library.
+        OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        DEPS "${fq_deps_list}"
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+  endif()
 
-  if(LLVM_LIBC_ENABLE_LINTING)
+  if(LLVM_LIBC_ENABLE_LINTING AND TARGET ${internal_target_name})
     if(NOT LLVM_LIBC_CLANG_TIDY)
       message(FATAL_ERROR "Something is wrong!  LLVM_LIBC_ENABLE_LINTING is "
               "ON but LLVM_LIBC_CLANG_TIDY is not set.")
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -4,12 +4,14 @@
 endif()
 
 # Set up the target architectures to build the GPU libc for.
-set(all_gpu_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
-                          "sm_70;sm_72;sm_75;sm_80;sm_86;gfx700;gfx701;gfx801;"
-                          "gfx803;gfx900;gfx902;gfx906;gfx908;gfx90a;gfx90c;"
-                          "gfx940;gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;"
-                          "gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;"
-                          "gfx1103")
+# Keep ';' only between entries: a trailing ';' would add an empty element.
+set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
+                             "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
+                             "gfx1100;gfx1101;gfx1102;gfx1103")
+set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
+                            "sm_70;sm_72;sm_75;sm_80;sm_86")
+set(all_gpu_architectures ${all_amdgpu_architectures} ${all_nvptx_architectures})
 set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures} CACHE STRING
     "List of GPU architectures to build the libc for.")
 if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
@@ -29,6 +31,15 @@
                       "GPU.")
 endif()
 
+# Locate the tool that packages multiple device images into a single binary;
+# the GPU build cannot proceed without it.
+find_program(LIBC_CLANG_OFFLOAD_PACKAGER NAMES clang-offload-packager
+             PATHS ${LLVM_BINARY_DIR}/bin)
+if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
+  message(FATAL_ERROR
+          "Cannot find the 'clang-offload-packager' for the GPU build")
+endif()
+
 # Identify any locally installed AMD GPUs on the system to use for testing.
 find_program(LIBC_AMDGPU_ARCH
              NAMES amdgpu-arch
diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h
--- a/libc/src/__support/common.h
+++ b/libc/src/__support/common.h
@@ -29,17 +29,15 @@
 #define LIBC_INLINE inline
 #endif
 
-// We use OpenMP to declare these functions on the device.
-#define STR(X) #X
-#define LLVM_LIBC_DECLARE_DEVICE(name)                                         \
-  _Pragma(STR(omp declare target to(name) device_type(nohost)))
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define PACKAGE_FOR_GPU // Set when __AMDGPU__ or __NVPTX__ is defined.
+#endif // defined(__AMDGPU__) || defined(__NVPTX__)
 
-// GPU targets do not support aliasing and must be declared on the device.
-#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(_OPENMP)
+// GPU targets do not support aliasing.
+#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(PACKAGE_FOR_GPU)
 #define LLVM_LIBC_FUNCTION(type, name, arglist)                                \
   LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name)                          \
       __##name##_impl__ __asm__(#name);                                        \
-  LLVM_LIBC_DECLARE_DEVICE(__##name##_impl__)                                  \
   type __##name##_impl__ arglist
 // MacOS needs to be excluded because it does not support aliasing.
 #elif defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))