Index: clang/lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3471,8 +3471,7 @@
     return false;
   const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
   // Use the default allocation.
-  return !((AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc ||
-            AA->getAllocatorType() == OMPAllocateDeclAttr::OMPNullMemAlloc) &&
+  return !(AA->getAllocatorType() == OMPAllocateDeclAttr::OMPDefaultMemAlloc &&
            !AA->getAllocator());
 }
 
@@ -12240,6 +12239,26 @@
   return CGF.GetAddrOfLocalVar(NativeParam);
 }
 
+/// Return allocator value from expression, or return a null allocator (default
+/// when no allocator specified).
+static llvm::Value *getAllocatorVal(CodeGenFunction &CGF,
+                                    const Expr *Allocator) {
+  llvm::Value *AllocVal;
+  if (Allocator) {
+    AllocVal = CGF.EmitScalarExpr(Allocator);
+    // According to the standard, the original allocator type is an enum
+    // (integer). Convert to pointer type, if required.
+    AllocVal = CGF.EmitScalarConversion(AllocVal, Allocator->getType(),
+                                        CGF.getContext().VoidPtrTy,
+                                        Allocator->getExprLoc());
+  } else {
+    // If no allocator specified, it defaults to the null allocator.
+    AllocVal = llvm::Constant::getNullValue(
+        CGF.CGM.getTypes().ConvertType(CGF.getContext().VoidPtrTy));
+  }
+  return AllocVal;
+}
+
 Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                    const VarDecl *VD) {
   if (!VD)
@@ -12276,20 +12295,24 @@
     }
     llvm::Value *ThreadID = getThreadID(CGF, CVD->getBeginLoc());
     const auto *AA = CVD->getAttr<OMPAllocateDeclAttr>();
-    assert(AA->getAllocator() &&
-           "Expected allocator expression for non-default allocator.");
-    llvm::Value *Allocator = CGF.EmitScalarExpr(AA->getAllocator());
-    // According to the standard, the original allocator type is a enum
-    // (integer). Convert to pointer type, if required.
-    Allocator = CGF.EmitScalarConversion(
-        Allocator, AA->getAllocator()->getType(), CGF.getContext().VoidPtrTy,
-        AA->getAllocator()->getExprLoc());
-    llvm::Value *Args[] = {ThreadID, Size, Allocator};
-
-    llvm::Value *Addr =
-        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                CGM.getModule(), OMPRTL___kmpc_alloc),
-                            Args, getName({CVD->getName(), ".void.addr"}));
+    const Expr *Allocator = AA->getAllocator();
+    llvm::Value *AllocVal = getAllocatorVal(CGF, Allocator);
+    llvm::Value *Alignment =
+        AA->getAlignment()
+            ? CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(AA->getAlignment()),
+                                        CGM.SizeTy, /*isSigned=*/false)
+            : nullptr;
+    SmallVector<llvm::Value *, 4> Args;
+    Args.push_back(ThreadID);
+    if (Alignment)
+      Args.push_back(Alignment);
+    Args.push_back(Size);
+    Args.push_back(AllocVal);
+    llvm::omp::RuntimeFunction FnID =
+        Alignment ? OMPRTL___kmpc_aligned_alloc : OMPRTL___kmpc_alloc;
+    llvm::Value *Addr = CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), FnID), Args,
+        getName({CVD->getName(), ".void.addr"}));
     llvm::FunctionCallee FiniRTLFn = OMPBuilder.getOrCreateRuntimeFunction(
         CGM.getModule(), OMPRTL___kmpc_free);
     QualType Ty = CGM.getContext().getPointerType(CVD->getType());
@@ -12303,14 +12326,14 @@
       llvm::FunctionCallee RTLFn;
       SourceLocation::UIntTy LocEncoding;
       Address Addr;
-      const Expr *Allocator;
+      const Expr *AllocExpr;
 
     public:
       OMPAllocateCleanupTy(llvm::FunctionCallee RTLFn,
                            SourceLocation::UIntTy LocEncoding, Address Addr,
-                           const Expr *Allocator)
+                           const Expr *AllocExpr)
           : RTLFn(RTLFn), LocEncoding(LocEncoding), Addr(Addr),
-            Allocator(Allocator) {}
+            AllocExpr(AllocExpr) {}
       void Emit(CodeGenFunction &CGF, Flags /*flags*/) override {
         if (!CGF.HaveInsertPoint())
           return;
@@ -12319,14 +12342,8 @@
             CGF, SourceLocation::getFromRawEncoding(LocEncoding));
         Args[1] = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
             Addr.getPointer(), CGF.VoidPtrTy);
-        llvm::Value *AllocVal = CGF.EmitScalarExpr(Allocator);
-        // According to the standard, the original allocator type is a enum
-        // (integer). Convert to pointer type, if required.
-        AllocVal = CGF.EmitScalarConversion(AllocVal, Allocator->getType(),
-                                            CGF.getContext().VoidPtrTy,
-                                            Allocator->getExprLoc());
+        llvm::Value *AllocVal = getAllocatorVal(CGF, AllocExpr);
         Args[2] = AllocVal;
-
         CGF.EmitRuntimeCall(RTLFn, Args);
       }
     };
@@ -12334,7 +12351,7 @@
         UntiedRealAddr.isValid() ? UntiedRealAddr : Address(Addr, Align);
     CGF.EHStack.pushCleanup<OMPAllocateCleanupTy>(
         NormalAndEHCleanup, FiniRTLFn, CVD->getLocation().getRawEncoding(),
-        VDAddr, AA->getAllocator());
+        VDAddr, Allocator);
     if (UntiedRealAddr.isValid())
       if (auto *Region =
              dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
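
In short, the codegen above now selects __kmpc_aligned_alloc whenever the
OMPAllocateDeclAttr carries an align clause value (Alignment is non-null) and
keeps emitting __kmpc_alloc otherwise; the alignment travels as an extra
size_t argument between the thread id and the allocation size. As orientation,
one case taken from the new test below (x86_64; %tid is an illustrative
stand-in for the [[TMP0]] thread-id value in the checks):

    // One case from align_clause_codegen.cpp below:
    int foo4[40];
    #pragma omp allocate(foo4) align(16) allocator(omp_high_bw_mem_alloc)
    // lowers to a call of the form:
    //   %.foo4..void.addr = call i8* @__kmpc_aligned_alloc(
    //       i32 %tid, i64 16, i64 160, i8* inttoptr (i64 4 to i8*))
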
Index: clang/test/OpenMP/align_clause_codegen.cpp
===================================================================
--- /dev/null
+++ clang/test/OpenMP/align_clause_codegen.cpp
@@ -0,0 +1,303 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
+// RUN: %clang_cc1 -emit-llvm -o - -fopenmp \
+// RUN:   -triple i386-unknown-unknown -fopenmp-version=51 %s | \
+// RUN:   FileCheck %s --check-prefix=CHECK-32
+// RUN: %clang_cc1 -emit-llvm -o - -fopenmp \
+// RUN:   -triple x86_64-unknown-linux-gnu -fopenmp-version=51 %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp \
+// RUN:   -triple x86_64-unknown-linux-gnu -fopenmp-version=51 \
+// RUN:   -emit-pch %s -o %t
+// RUN: %clang_cc1 -fopenmp \
+// RUN:   -triple x86_64-unknown-linux-gnu -fopenmp-version=51 \
+// RUN:   -include-pch %t -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef enum omp_allocator_handle_t {
+  omp_null_allocator = 0,
+  omp_default_mem_alloc = 1,
+  omp_large_cap_mem_alloc = 2,
+  omp_const_mem_alloc = 3,
+  omp_high_bw_mem_alloc = 4,
+  omp_low_lat_mem_alloc = 5,
+  omp_cgroup_mem_alloc = 6,
+  omp_pteam_mem_alloc = 7,
+  omp_thread_mem_alloc = 8,
+  KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
+} omp_allocator_handle_t;
+
+int main() {
+  int foo0[5];
+  int foo1[10];
+  int foo2[20];
+  int foo3[30];
+  int foo4[40];
+  int foo5[50];
+  int foo6[60];
+  int foo7[70];
+  int foo8[80];
+  omp_allocator_handle_t MyAlloc = omp_large_cap_mem_alloc;
+
+#pragma omp allocate(foo0) align(1)
+#pragma omp allocate(foo1) allocator(omp_pteam_mem_alloc) align(2)
+#pragma omp allocate(foo2) align(4) allocator(omp_cgroup_mem_alloc)
+#pragma omp allocate(foo3) align(8) allocator(omp_low_lat_mem_alloc)
+#pragma omp allocate(foo4) align(16) allocator(omp_high_bw_mem_alloc)
+#pragma omp allocate(foo5) align(32) allocator(omp_const_mem_alloc)
+#pragma omp allocate(foo6) align(64) allocator(omp_large_cap_mem_alloc)
+#pragma omp allocate(foo7) align(32) allocator(omp_thread_mem_alloc)
+#pragma omp allocate(foo8) align(16) allocator(omp_null_allocator)
+  {
+    double foo9[80];
+    double foo10[90];
+#pragma omp allocate(foo9) align(8) allocator(omp_thread_mem_alloc)
+#pragma omp allocate(foo10) align(128)
+  }
+  {
+    int bar1;
+    int bar2[10];
+    int bar3[20];
+    int *bar4;
+    float bar5;
+    double bar6[30];
+#pragma omp allocate(bar1, bar2, bar3) align(2) allocator(MyAlloc)
+#pragma omp allocate(bar4, bar5, bar6) align(16)
+  }
+}
+
+// Verify align clause in template with non-type template parameter.
+template <typename T, unsigned size, unsigned align>
+T run() {
+  T foo[size];
+#pragma omp allocate(foo) align(align) allocator(omp_cgroup_mem_alloc)
+  return foo[0];
+}
+
+int template_test() {
+  double result;
+  result = run<double, 1000, 16>();
+  return 0;
+}
+#endif
+
+// CHECK-32-LABEL: define {{[^@]+}}@main
+// CHECK-32-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[MYALLOC:%.*]] = alloca i32, align 4
+// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-32-NEXT: [[DOTFOO0__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 1, i32 20, i8* null)
+// CHECK-32-NEXT: [[DOTFOO0__ADDR:%.*]] = bitcast i8* [[DOTFOO0__VOID_ADDR]] to [5 x i32]*
+// CHECK-32-NEXT: [[DOTFOO1__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 2, i32 40, i8* inttoptr (i32 7 to i8*))
+// CHECK-32-NEXT: [[DOTFOO1__ADDR:%.*]] = bitcast i8* [[DOTFOO1__VOID_ADDR]] to [10 x i32]*
+// CHECK-32-NEXT: [[DOTFOO2__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 4, i32 80, i8* inttoptr (i32 6 to i8*))
+// CHECK-32-NEXT: [[DOTFOO2__ADDR:%.*]] = bitcast i8* [[DOTFOO2__VOID_ADDR]] to [20 x i32]*
+// CHECK-32-NEXT: [[DOTFOO3__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 8, i32 120, i8* inttoptr (i32 5 to i8*))
+// CHECK-32-NEXT: [[DOTFOO3__ADDR:%.*]] = bitcast i8* [[DOTFOO3__VOID_ADDR]] to [30 x i32]*
+// CHECK-32-NEXT: [[DOTFOO4__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 160, i8* inttoptr (i32 4 to i8*))
+// CHECK-32-NEXT: [[DOTFOO4__ADDR:%.*]] = bitcast i8* [[DOTFOO4__VOID_ADDR]] to [40 x i32]*
+// CHECK-32-NEXT: [[DOTFOO5__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 32, i32 200, i8* inttoptr (i32 3 to i8*))
+// CHECK-32-NEXT: [[DOTFOO5__ADDR:%.*]] = bitcast i8* [[DOTFOO5__VOID_ADDR]] to [50 x i32]*
+// CHECK-32-NEXT: [[DOTFOO6__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 64, i32 240, i8* inttoptr (i32 2 to i8*))
+// CHECK-32-NEXT: [[DOTFOO6__ADDR:%.*]] = bitcast i8* [[DOTFOO6__VOID_ADDR]] to [60 x i32]*
+// CHECK-32-NEXT: [[DOTFOO7__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 32, i32 280, i8* inttoptr (i32 8 to i8*))
+// CHECK-32-NEXT: [[DOTFOO7__ADDR:%.*]] = bitcast i8* [[DOTFOO7__VOID_ADDR]] to [70 x i32]*
+// CHECK-32-NEXT: [[DOTFOO8__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 320, i8* null)
+// CHECK-32-NEXT: [[DOTFOO8__ADDR:%.*]] = bitcast i8* [[DOTFOO8__VOID_ADDR]] to [80 x i32]*
+// CHECK-32-NEXT: store i32 2, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[DOTFOO9__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 8, i32 640, i8* inttoptr (i32 8 to i8*))
+// CHECK-32-NEXT: [[DOTFOO9__ADDR:%.*]] = bitcast i8* [[DOTFOO9__VOID_ADDR]] to [80 x double]*
+// CHECK-32-NEXT: [[DOTFOO10__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 128, i32 720, i8* null)
+// CHECK-32-NEXT: [[DOTFOO10__ADDR:%.*]] = bitcast i8* [[DOTFOO10__VOID_ADDR]] to [90 x double]*
+// CHECK-32-NEXT: [[TMP1:%.*]] = bitcast [90 x double]* [[DOTFOO10__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP1]], i8* null)
+// CHECK-32-NEXT: [[TMP2:%.*]] = bitcast [80 x double]* [[DOTFOO9__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP2]], i8* inttoptr (i32 8 to i8*))
+// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV:%.*]] = inttoptr i32 [[TMP3]] to i8*
+// CHECK-32-NEXT: [[DOTBAR1__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 2, i32 4, i8* [[CONV]])
+// CHECK-32-NEXT: [[DOTBAR1__ADDR:%.*]] = bitcast i8* [[DOTBAR1__VOID_ADDR]] to i32*
+// CHECK-32-NEXT: [[TMP4:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV1:%.*]] = inttoptr i32 [[TMP4]] to i8*
+// CHECK-32-NEXT: [[DOTBAR2__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 2, i32 40, i8* [[CONV1]])
+// CHECK-32-NEXT: [[DOTBAR2__ADDR:%.*]] = bitcast i8* [[DOTBAR2__VOID_ADDR]] to [10 x i32]*
+// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV2:%.*]] = inttoptr i32 [[TMP5]] to i8*
+// CHECK-32-NEXT: [[DOTBAR3__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 2, i32 80, i8* [[CONV2]])
+// CHECK-32-NEXT: [[DOTBAR3__ADDR:%.*]] = bitcast i8* [[DOTBAR3__VOID_ADDR]] to [20 x i32]*
+// CHECK-32-NEXT: [[DOTBAR4__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 4, i8* null)
+// CHECK-32-NEXT: [[DOTBAR4__ADDR:%.*]] = bitcast i8* [[DOTBAR4__VOID_ADDR]] to i32**
+// CHECK-32-NEXT: [[DOTBAR5__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 4, i8* null)
+// CHECK-32-NEXT: [[DOTBAR5__ADDR:%.*]] = bitcast i8* [[DOTBAR5__VOID_ADDR]] to float*
+// CHECK-32-NEXT: [[DOTBAR6__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 240, i8* null)
+// CHECK-32-NEXT: [[DOTBAR6__ADDR:%.*]] = bitcast i8* [[DOTBAR6__VOID_ADDR]] to [30 x double]*
+// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast [30 x double]* [[DOTBAR6__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP6]], i8* null)
+// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast float* [[DOTBAR5__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP7]], i8* null)
+// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32** [[DOTBAR4__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP8]], i8* null)
+// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [20 x i32]* [[DOTBAR3__ADDR]] to i8*
+// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV3:%.*]] = inttoptr i32 [[TMP10]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP9]], i8* [[CONV3]])
+// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[DOTBAR2__ADDR]] to i8*
+// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV4:%.*]] = inttoptr i32 [[TMP12]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP11]], i8* [[CONV4]])
+// CHECK-32-NEXT: [[TMP13:%.*]] = bitcast i32* [[DOTBAR1__ADDR]] to i8*
+// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[MYALLOC]], align 4
+// CHECK-32-NEXT: [[CONV5:%.*]] = inttoptr i32 [[TMP14]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP13]], i8* [[CONV5]])
+// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast [80 x i32]* [[DOTFOO8__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP15]], i8* null)
+// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast [70 x i32]* [[DOTFOO7__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP16]], i8* inttoptr (i32 8 to i8*))
+// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast [60 x i32]* [[DOTFOO6__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP17]], i8* inttoptr (i32 2 to i8*))
+// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast [50 x i32]* [[DOTFOO5__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP18]], i8* inttoptr (i32 3 to i8*))
+// CHECK-32-NEXT: [[TMP19:%.*]] = bitcast [40 x i32]* [[DOTFOO4__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP19]], i8* inttoptr (i32 4 to i8*))
+// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast [30 x i32]* [[DOTFOO3__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP20]], i8* inttoptr (i32 5 to i8*))
+// CHECK-32-NEXT: [[TMP21:%.*]] = bitcast [20 x i32]* [[DOTFOO2__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP21]], i8* inttoptr (i32 6 to i8*))
+// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast [10 x i32]* [[DOTFOO1__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP22]], i8* inttoptr (i32 7 to i8*))
+// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast [5 x i32]* [[DOTFOO0__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP23]], i8* null)
+// CHECK-32-NEXT: ret i32 0
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_Z13template_testv
+// CHECK-32-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[RESULT:%.*]] = alloca double, align 8
+// CHECK-32-NEXT: [[CALL:%.*]] = call double @_Z3runIdLj1000ELj16EET_v()
+// CHECK-32-NEXT: store double [[CALL]], double* [[RESULT]], align 8
+// CHECK-32-NEXT: ret i32 0
+//
+//
+// CHECK-32-LABEL: define {{[^@]+}}@_Z3runIdLj1000ELj16EET_v
+// CHECK-32-SAME: () #[[ATTR2]] comdat {
+// CHECK-32-NEXT: entry:
+// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-32-NEXT: [[DOTFOO__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i32 16, i32 8000, i8* inttoptr (i32 6 to i8*))
+// CHECK-32-NEXT: [[DOTFOO__ADDR:%.*]] = bitcast i8* [[DOTFOO__VOID_ADDR]] to [1000 x double]*
+// CHECK-32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x double], [1000 x double]* [[DOTFOO__ADDR]], i32 0, i32 0
+// CHECK-32-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 8
+// CHECK-32-NEXT: [[TMP2:%.*]] = bitcast [1000 x double]* [[DOTFOO__ADDR]] to i8*
+// CHECK-32-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP2]], i8* inttoptr (i32 6 to i8*))
+// CHECK-32-NEXT: ret double [[TMP1]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[MYALLOC:%.*]] = alloca i64, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: [[DOTFOO0__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 1, i64 32, i8* null)
+// CHECK-NEXT: [[DOTFOO0__ADDR:%.*]] = bitcast i8* [[DOTFOO0__VOID_ADDR]] to [5 x i32]*
+// CHECK-NEXT: [[DOTFOO1__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 2, i64 48, i8* inttoptr (i64 7 to i8*))
+// CHECK-NEXT: [[DOTFOO1__ADDR:%.*]] = bitcast i8* [[DOTFOO1__VOID_ADDR]] to [10 x i32]*
+// CHECK-NEXT: [[DOTFOO2__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 4, i64 80, i8* inttoptr (i64 6 to i8*))
+// CHECK-NEXT: [[DOTFOO2__ADDR:%.*]] = bitcast i8* [[DOTFOO2__VOID_ADDR]] to [20 x i32]*
+// CHECK-NEXT: [[DOTFOO3__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 128, i8* inttoptr (i64 5 to i8*))
+// CHECK-NEXT: [[DOTFOO3__ADDR:%.*]] = bitcast i8* [[DOTFOO3__VOID_ADDR]] to [30 x i32]*
+// CHECK-NEXT: [[DOTFOO4__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 160, i8* inttoptr (i64 4 to i8*))
+// CHECK-NEXT: [[DOTFOO4__ADDR:%.*]] = bitcast i8* [[DOTFOO4__VOID_ADDR]] to [40 x i32]*
+// CHECK-NEXT: [[DOTFOO5__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 32, i64 208, i8* inttoptr (i64 3 to i8*))
+// CHECK-NEXT: [[DOTFOO5__ADDR:%.*]] = bitcast i8* [[DOTFOO5__VOID_ADDR]] to [50 x i32]*
+// CHECK-NEXT: [[DOTFOO6__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 64, i64 240, i8* inttoptr (i64 2 to i8*))
+// CHECK-NEXT: [[DOTFOO6__ADDR:%.*]] = bitcast i8* [[DOTFOO6__VOID_ADDR]] to [60 x i32]*
+// CHECK-NEXT: [[DOTFOO7__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 32, i64 288, i8* inttoptr (i64 8 to i8*))
+// CHECK-NEXT: [[DOTFOO7__ADDR:%.*]] = bitcast i8* [[DOTFOO7__VOID_ADDR]] to [70 x i32]*
+// CHECK-NEXT: [[DOTFOO8__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 320, i8* null)
+// CHECK-NEXT: [[DOTFOO8__ADDR:%.*]] = bitcast i8* [[DOTFOO8__VOID_ADDR]] to [80 x i32]*
+// CHECK-NEXT: store i64 2, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[DOTFOO9__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 8, i64 640, i8* inttoptr (i64 8 to i8*))
+// CHECK-NEXT: [[DOTFOO9__ADDR:%.*]] = bitcast i8* [[DOTFOO9__VOID_ADDR]] to [80 x double]*
+// CHECK-NEXT: [[DOTFOO10__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 128, i64 720, i8* null)
+// CHECK-NEXT: [[DOTFOO10__ADDR:%.*]] = bitcast i8* [[DOTFOO10__VOID_ADDR]] to [90 x double]*
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast [90 x double]* [[DOTFOO10__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP1]], i8* null)
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast [80 x double]* [[DOTFOO9__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP2]], i8* inttoptr (i64 8 to i8*))
+// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV:%.*]] = inttoptr i64 [[TMP3]] to i8*
+// CHECK-NEXT: [[DOTBAR1__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 2, i64 4, i8* [[CONV]])
+// CHECK-NEXT: [[DOTBAR1__ADDR:%.*]] = bitcast i8* [[DOTBAR1__VOID_ADDR]] to i32*
+// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV1:%.*]] = inttoptr i64 [[TMP4]] to i8*
+// CHECK-NEXT: [[DOTBAR2__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 2, i64 48, i8* [[CONV1]])
+// CHECK-NEXT: [[DOTBAR2__ADDR:%.*]] = bitcast i8* [[DOTBAR2__VOID_ADDR]] to [10 x i32]*
+// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV2:%.*]] = inttoptr i64 [[TMP5]] to i8*
+// CHECK-NEXT: [[DOTBAR3__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 2, i64 80, i8* [[CONV2]])
+// CHECK-NEXT: [[DOTBAR3__ADDR:%.*]] = bitcast i8* [[DOTBAR3__VOID_ADDR]] to [20 x i32]*
+// CHECK-NEXT: [[DOTBAR4__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 8, i8* null)
+// CHECK-NEXT: [[DOTBAR4__ADDR:%.*]] = bitcast i8* [[DOTBAR4__VOID_ADDR]] to i32**
+// CHECK-NEXT: [[DOTBAR5__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 4, i8* null)
+// CHECK-NEXT: [[DOTBAR5__ADDR:%.*]] = bitcast i8* [[DOTBAR5__VOID_ADDR]] to float*
+// CHECK-NEXT: [[DOTBAR6__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 240, i8* null)
+// CHECK-NEXT: [[DOTBAR6__ADDR:%.*]] = bitcast i8* [[DOTBAR6__VOID_ADDR]] to [30 x double]*
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast [30 x double]* [[DOTBAR6__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP6]], i8* null)
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[DOTBAR5__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP7]], i8* null)
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[DOTBAR4__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP8]], i8* null)
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast [20 x i32]* [[DOTBAR3__ADDR]] to i8*
+// CHECK-NEXT: [[TMP10:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV3:%.*]] = inttoptr i64 [[TMP10]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP9]], i8* [[CONV3]])
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast [10 x i32]* [[DOTBAR2__ADDR]] to i8*
+// CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV4:%.*]] = inttoptr i64 [[TMP12]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP11]], i8* [[CONV4]])
+// CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[DOTBAR1__ADDR]] to i8*
+// CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[MYALLOC]], align 8
+// CHECK-NEXT: [[CONV5:%.*]] = inttoptr i64 [[TMP14]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP13]], i8* [[CONV5]])
+// CHECK-NEXT: [[TMP15:%.*]] = bitcast [80 x i32]* [[DOTFOO8__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP15]], i8* null)
+// CHECK-NEXT: [[TMP16:%.*]] = bitcast [70 x i32]* [[DOTFOO7__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP16]], i8* inttoptr (i64 8 to i8*))
+// CHECK-NEXT: [[TMP17:%.*]] = bitcast [60 x i32]* [[DOTFOO6__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP17]], i8* inttoptr (i64 2 to i8*))
+// CHECK-NEXT: [[TMP18:%.*]] = bitcast [50 x i32]* [[DOTFOO5__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP18]], i8* inttoptr (i64 3 to i8*))
+// CHECK-NEXT: [[TMP19:%.*]] = bitcast [40 x i32]* [[DOTFOO4__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP19]], i8* inttoptr (i64 4 to i8*))
+// CHECK-NEXT: [[TMP20:%.*]] = bitcast [30 x i32]* [[DOTFOO3__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP20]], i8* inttoptr (i64 5 to i8*))
+// CHECK-NEXT: [[TMP21:%.*]] = bitcast [20 x i32]* [[DOTFOO2__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP21]], i8* inttoptr (i64 6 to i8*))
+// CHECK-NEXT: [[TMP22:%.*]] = bitcast [10 x i32]* [[DOTFOO1__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP22]], i8* inttoptr (i64 7 to i8*))
+// CHECK-NEXT: [[TMP23:%.*]] = bitcast [5 x i32]* [[DOTFOO0__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP23]], i8* null)
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z13template_testv
+// CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RESULT:%.*]] = alloca double, align 8
+// CHECK-NEXT: [[CALL:%.*]] = call double @_Z3runIdLj1000ELj16EET_v()
+// CHECK-NEXT: store double [[CALL]], double* [[RESULT]], align 8
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3runIdLj1000ELj16EET_v
+// CHECK-SAME: () #[[ATTR2]] comdat {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK-NEXT: [[DOTFOO__VOID_ADDR:%.*]] = call i8* @__kmpc_aligned_alloc(i32 [[TMP0]], i64 16, i64 8000, i8* inttoptr (i64 6 to i8*))
+// CHECK-NEXT: [[DOTFOO__ADDR:%.*]] = bitcast i8* [[DOTFOO__VOID_ADDR]] to [1000 x double]*
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x double], [1000 x double]* [[DOTFOO__ADDR]], i64 0, i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast [1000 x double]* [[DOTFOO__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP2]], i8* inttoptr (i64 6 to i8*))
+// CHECK-NEXT: ret double [[TMP1]]
+//
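
A note on the autogenerated checks above: the mangled symbol
_Z3runIdLj1000ELj16EET_v demangles (Itanium ABI) to the instantiation sketched
below, which is why the same 16-byte-aligned allocation of 8000 bytes (1000
doubles) appears in both the 32-bit and 64-bit check blocks:

    // Demangled form of _Z3runIdLj1000ELj16EET_v (illustrative):
    template double run<double, 1000u, 16u>();
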
Index: clang/test/OpenMP/allocate_codegen.cpp
===================================================================
--- clang/test/OpenMP/allocate_codegen.cpp
+++ clang/test/OpenMP/allocate_codegen.cpp
@@ -30,13 +30,13 @@
   KMP_ALLOCATOR_MAX_HANDLE = __UINTPTR_MAX__
 };
 
-struct St{
- int a;
+struct St {
+  int a;
 };
 
-struct St1{
- int a;
- static int b;
+struct St1 {
+  int a;
+  static int b;
 #pragma omp allocate(b) allocator(omp_default_mem_alloc)
 } d;
 
@@ -48,36 +48,49 @@
 template <class T>
 struct ST {
   static T m;
-  #pragma omp allocate(m) allocator(omp_low_lat_mem_alloc)
+#pragma omp allocate(m) allocator(omp_low_lat_mem_alloc)
 };
 
 template <class T> T foo() {
   T v;
-  #pragma omp allocate(v) allocator(omp_cgroup_mem_alloc)
+#pragma omp allocate(v) allocator(omp_cgroup_mem_alloc)
   v = ST<T>::m;
   return v;
 }
 
-namespace ns{
-  int a;
+namespace ns {
+int a;
 }
 #pragma omp allocate(ns::a) allocator(omp_pteam_mem_alloc)
 // CHECK-NOT: call {{.+}} {{__kmpc_alloc|__kmpc_free}}
 
-// CHECK-LABEL: @main
-int main () {
+int main() {
   static int a;
 #pragma omp allocate(a) allocator(omp_thread_mem_alloc)
-  a=2;
-  // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}}
-  // CHECK: alloca double,
-  // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}}
+  a = 2;
   double b = 3;
 #pragma omp allocate(b)
   return (foo<int>());
 }
 
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4
+// CHECK-NEXT: store i32 2, i32* @_ZZ4mainE1a, align 4
+// CHECK-NEXT: [[DOTB__VOID_ADDR:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP0]], i64 8, i8* null)
+// CHECK-NEXT: [[DOTB__ADDR:%.*]] = bitcast i8* [[DOTB__VOID_ADDR]] to double*
+// CHECK-NEXT: store double 3.000000e+00, double* [[DOTB__ADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooIiET_v()
+// CHECK-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[DOTB__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP1]], i8* null)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[RETVAL]], align 4
+// CHECK-NEXT: ret i32 [[TMP2]]
+
 // CHECK: define {{.*}}i32 @{{.+}}foo{{.+}}()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK-NEXT: [[V_VOID_ADDR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8* inttoptr (i64 6 to i8*))
@@ -101,11 +114,11 @@
 // CHECK: [[Z_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 8, i8* inttoptr (i64 1 to i8*))
 // CHECK: [[Z_ADDR:%.+]] = bitcast i8* [[Z_VOID_PTR]] to float**
 // CHECK: store float* %{{.+}}, float** [[Z_ADDR]],
-#pragma omp allocate(a,z) allocator(omp_default_mem_alloc)
-// CHECK-NEXT: [[Z_VOID_PTR:%.+]] = bitcast float** [[Z_ADDR]] to i8*
-// CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[Z_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
-// CHECK-NEXT: [[A_VOID_PTR:%.+]] = bitcast i32* [[A_ADDR]] to i8*
-// CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
-// CHECK: ret void
+#pragma omp allocate(a, z) allocator(omp_default_mem_alloc)
+  // CHECK-NEXT: [[Z_VOID_PTR:%.+]] = bitcast float** [[Z_ADDR]] to i8*
+  // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[Z_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
+  // CHECK-NEXT: [[A_VOID_PTR:%.+]] = bitcast i32* [[A_ADDR]] to i8*
+  // CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[A_VOID_PTR]], i8* inttoptr (i64 1 to i8*))
+  // CHECK: ret void
 }
 #endif
Index: clang/test/OpenMP/allocate_codegen_attr.cpp
===================================================================
--- clang/test/OpenMP/allocate_codegen_attr.cpp
+++ clang/test/OpenMP/allocate_codegen_attr.cpp
@@ -65,19 +65,32 @@
 
 // CHECK-NOT: call {{.+}} {{__kmpc_alloc|__kmpc_free}}
 
-// CHECK-LABEL: @main
 int main () {
   static int a;
   [[omp::directive(allocate(a) allocator(omp_thread_mem_alloc))]];
   a=2;
-  // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}}
-  // CHECK: alloca double,
-  // CHECK-NOT: {{__kmpc_alloc|__kmpc_free}}
   double b = 3;
   [[omp::directive(allocate(b))]];
   return (foo<int>());
 }
 
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: store i32 0, i32* [[RETVAL]], align 4
+// CHECK-NEXT: store i32 2, i32* @_ZZ4mainE1a, align 4
+// CHECK-NEXT: [[DOTB__VOID_ADDR:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP0]], i64 8, i8* null)
+// CHECK-NEXT: [[DOTB__ADDR:%.*]] = bitcast i8* [[DOTB__VOID_ADDR]] to double*
+// CHECK-NEXT: store double 3.000000e+00, double* [[DOTB__ADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooIiET_v()
+// CHECK-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[DOTB__ADDR]] to i8*
+// CHECK-NEXT: call void @__kmpc_free(i32 [[TMP0]], i8* [[TMP1]], i8* null)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[RETVAL]], align 4
+// CHECK-NEXT: ret i32 [[TMP2]]
+
 // CHECK: define {{.*}}i32 @{{.+}}foo{{.+}}()
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
 // CHECK-NEXT: [[V_VOID_ADDR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], i64 4, i8* inttoptr (i64 6 to i8*))
Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -382,6 +382,8 @@
 __OMP_RTL(__kmpc_doacross_fini, false, Void, IdentPtr, Int32)
 
 __OMP_RTL(__kmpc_alloc, false, VoidPtr, /* Int */ Int32, SizeTy, VoidPtr)
+__OMP_RTL(__kmpc_aligned_alloc, false, VoidPtr, /* Int */ Int32, SizeTy, SizeTy,
+          VoidPtr)
 __OMP_RTL(__kmpc_free, false, Void, /* Int */ Int32, VoidPtr, VoidPtr)
 
 __OMP_RTL(__kmpc_init_allocator, false, /* omp_allocator_handle_t */ VoidPtr,
@@ -905,6 +907,8 @@
                 ParamAttrs(NoCaptureAttrs))
 __OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_aligned_alloc, DefaultAttrs, ReturnPtrAttrs,
+                ParamAttrs())
 __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), ParamAttrs())
 
 __OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs,
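
For reference, the __OMP_RTL entry above fully determines the new entry
point's type: VoidPtr return, then Int32 (global thread id), SizeTy
(alignment), SizeTy (size), and VoidPtr (allocator handle). In C++ terms that
corresponds to a declaration along these lines (a sketch inferred from the
.def entry and the call sites in the tests; parameter names are illustrative,
not taken from the runtime headers):

    // Sketch only; parameter names are assumptions, types follow the
    // __OMP_RTL(__kmpc_aligned_alloc, ...) entry above.
    extern "C" void *__kmpc_aligned_alloc(int gtid, size_t alignment,
                                          size_t size, void *allocator);
    // For comparison, the pre-existing entry point without an explicit
    // alignment argument:
    extern "C" void *__kmpc_alloc(int gtid, size_t size, void *allocator);
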