diff --git a/clang/test/CodeGenCUDA/b52037.ll b/clang/test/CodeGenCUDA/b52037.ll
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/b52037.ll
@@ -0,0 +1,267 @@
+; Reproducer for a bad performance regression triggered by the switch to the
+; new pass manager (PM). `barney` ended up with its local variables not being
+; optimized away, and that had a rather dramatic effect on some GPU code. See
+; https://bugs.llvm.org/show_bug.cgi?id=52037 for the gory details.
+;
+; Even though the fix for it is in LLVM, the issue is hard to reproduce with
+; opt+llc. The pipeline created by the LLVM tools is somewhat different from
+; the one created by clang, and that's enough to affect the reproducer.
+;
+; Both invocations below emit PTX (-S with an nvptx64 triple), so the test can
+; only run when the NVPTX backend is built into the toolchain under test.
+; REQUIRES: nvptx-registered-target
+;
+; The first invocation forces the legacy pass manager, the second one uses
+; clang's default; both outputs are verified against the same assertions.
+; RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -aux-triple x86_64-pc-linux-gnu -S -aux-target-cpu x86-64 \
+; RUN:  -fcuda-is-device -target-cpu sm_70 -Wno-override-module -O3 -x ir %s -flegacy-pass-manager -o - \
+; RUN: | FileCheck  %s
+; RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -aux-triple x86_64-pc-linux-gnu -S -aux-target-cpu x86-64 \
+; RUN:  -fcuda-is-device -target-cpu sm_70 -Wno-override-module -O3 -x ir %s -o - \
+; RUN: | FileCheck  %s
+
+; Between the entry label of `barney` and its `ret`, the PTX must not declare
+; a `.local` `__local_depot` buffer, i.e. the locals must have been optimized away.
+; CHECK-LABEL: .visible .entry barney(
+; CHECK-NOT:  .local{{.*}}__local_depot
+; CHECK: ret;
+
+source_filename = "reduced.1.ll"
+
+%char3 = type { i8, i8, i8 }
+%float4 = type { float, float, float, float }
+%float3 = type { float, float, float }
+%int3 = type { i32, i32, i32 }
+%struct.wwwww = type { i32 (...)**, [8 x i8], i32, [12 x i8] } ; parameter type of @pluto
+%struct.blam = type <{ i32*, i16*, %float4, %int3, i32, %float3, [4 x i8], i64, i32, i8, [3 x i8], i32, [12 x i8] }> ; packed; an i32*, then the fields of %struct.foo, then [12 x i8]
+%struct.spam.2 = type { %struct.foo.3, i16*, float, float, i32, float } ; pointee type of @barney's argument
+%struct.foo.3 = type <{ %float4*, %float4*, %float4*, i32*, i32*, i32, i32, float }>
+%struct.zot = type { %struct.bar, [8 x i8], %struct.foo, [12 x i8] } ; type of the alloca that @barney passes to @pluto
+%struct.bar = type { i32 (...)** }
+%struct.foo = type <{ i16*, %float4, %int3, i32, %float3, [4 x i8], i64, i32, i8, [3 x i8], i32 }>
+
+@global = external addrspace(4) externally_initialized global [27 x %char3], align 1 ; @pluto indexes this table with an i8 it has just stored
+@global.1 = linkonce_odr unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* inttoptr (i64 16 to i8*), i8* null, i8* null] }, align 8 ; element 0 is the integer 16; @barney stores the address just past element 2 into the first word of its local object
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0
+
+declare %float4 @snork(float) ; external; called from @barney, which ignores the result
+
+declare %float3 @bar.2(float, float) ; external; @pluto stores the result into %struct.blam field 5
+
+declare %float3 @zot() ; external; @pluto uses elements 0 and 2 of the result
+
+declare %int3 @hoge(i32, i32, i32) ; external; called from @pluto, which ignores the result
+
+define internal i1 @pluto(%struct.wwwww* %arg) { ; internal helper; its only call site in this file is in @barney
+bb: ; entry: add @foo() to field 7, fetch an i16 and return false if it equals 65535
+  %tmp = call i64 @foo()
+  %tmp1 = bitcast %struct.wwwww* %arg to i8**
+  %tmp2 = load i8*, i8** %tmp1, align 8, !tbaa !1 ; pointer held in the first word of *%arg
+  %tmp3 = getelementptr i8, i8* %tmp2, i64 -24
+  %tmp4 = bitcast i8* %tmp3 to i64*
+  %tmp5 = load i64, i64* %tmp4, align 8 ; i64 located 24 bytes before that pointer; used as a byte offset below
+  %tmp6 = bitcast %struct.wwwww* %arg to i8*
+  %tmp7 = getelementptr inbounds i8, i8* %tmp6, i64 %tmp5 ; %arg + offset, viewed as %struct.blam from here on
+  %tmp8 = bitcast i8* %tmp7 to %struct.blam*
+  %tmp9 = getelementptr inbounds %struct.blam, %struct.blam* %tmp8, i32 undef, i32 7 ; field 7 (i64); the first GEP index is undef, presumably left over from test-case reduction
+  %tmp10 = load i64, i64* %tmp9, align 16
+  %tmp11 = add i64 %tmp10, %tmp
+  store i64 %tmp11, i64* %tmp9, align 16, !tbaa !4 ; field 7 += @foo()
+  %tmp12 = bitcast %struct.wwwww* %arg to i8**
+  %tmp13 = load i8*, i8** %tmp12, align 8, !tbaa !1 ; same pointer/offset/view sequence as above, recomputed
+  %tmp14 = getelementptr i8, i8* %tmp13, i64 -24
+  %tmp15 = bitcast i8* %tmp14 to i64*
+  %tmp16 = load i64, i64* %tmp15, align 8
+  %tmp17 = bitcast %struct.wwwww* %arg to i8*
+  %tmp18 = getelementptr inbounds i8, i8* %tmp17, i64 %tmp16
+  %tmp19 = bitcast i8* %tmp18 to %struct.blam*
+  %tmp20 = getelementptr inbounds %struct.blam, %struct.blam* %tmp19, i32 undef, i32 1
+  %tmp21 = load i16*, i16** %tmp20, align 8 ; field 1 (i16*)
+  %tmp22 = bitcast %struct.wwwww* %arg to i8**
+  %tmp23 = load i8*, i8** %tmp22, align 8
+  %tmp24 = getelementptr i8, i8* %tmp23, i64 -24
+  %tmp25 = bitcast i8* %tmp24 to i64*
+  %tmp26 = load i64, i64* %tmp25, align 8
+  %tmp27 = bitcast %struct.wwwww* %arg to i8*
+  %tmp28 = getelementptr inbounds i8, i8* %tmp27, i64 %tmp26
+  %tmp29 = bitcast i8* %tmp28 to %struct.blam*
+  %tmp30 = getelementptr inbounds %struct.blam, %struct.blam* %tmp29, i32 undef, i32 7
+  %tmp31 = load i64, i64* %tmp30, align 16 ; field 7 again, read after the store above
+  %tmp32 = bitcast %struct.wwwww* %arg to i8**
+  %tmp33 = load i8*, i8** %tmp32, align 8
+  %tmp34 = getelementptr i8, i8* %tmp33, i64 -24
+  %tmp35 = bitcast i8* %tmp34 to i64*
+  %tmp36 = load i64, i64* %tmp35, align 8
+  %tmp37 = bitcast %struct.wwwww* %arg to i8*
+  %tmp38 = getelementptr inbounds i8, i8* %tmp37, i64 %tmp36
+  %tmp39 = bitcast i8* %tmp38 to %struct.blam*
+  %tmp40 = getelementptr inbounds %struct.blam, %struct.blam* %tmp39, i32 undef, i32 4
+  %tmp41 = load i32, i32* %tmp40, align 4 ; field 4 (i32)
+  %tmp42 = zext i32 %tmp41 to i64
+  %tmp43 = add i64 %tmp31, %tmp42
+  %tmp44 = getelementptr inbounds i16, i16* %tmp21, i64 %tmp43 ; &field1[field7 + zext(field4)]
+  %tmp45 = load i16, i16* %tmp44, align 2
+  %tmp46 = zext i16 %tmp45 to i32
+  %tmp47 = icmp eq i32 %tmp46, 65535
+  br i1 %tmp47, label %bb67, label %bb49 ; 65535 (0xFFFF) goes straight to the exit block and yields false
+
+bb49:                                             ; preds = %bb
+  %tmp50 = bitcast %struct.wwwww* %arg to i8**
+  %tmp51 = load i8*, i8** %tmp50, align 8
+  %tmp52 = getelementptr i8, i8* %tmp51, i64 -24
+  %tmp53 = bitcast i8* %tmp52 to i64*
+  %tmp54 = load i64, i64* %tmp53, align 8
+  %tmp55 = bitcast %struct.wwwww* %arg to i8*
+  %tmp56 = getelementptr inbounds i8, i8* %tmp55, i64 %tmp54
+  %tmp57 = bitcast i8* %tmp56 to %struct.blam*
+  %tmp58 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 2 ; field 2 (%float4)
+  %tmp59 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 5 ; field 5 (%float3)
+  %tmp60 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 0 ; field 0 (i32*)
+  %tmp61 = load i32*, i32** %tmp60, align 16
+  %tmp62 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 3 ; field 3 (%int3)
+  %tmp63 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 9 ; field 9 (i8)
+  %tmp64 = getelementptr inbounds %struct.blam, %struct.blam* %tmp57, i32 undef, i32 8 ; field 8 (i32)
+  %tmp8.i = zext i16 %tmp45 to i32 ; same value as %tmp46 in the entry block
+  %tmp9.i = ashr i32 %tmp8.i, 11
+  %tmp10.i = sub nsw i32 %tmp9.i, 1
+  %tmp11.i = trunc i32 %tmp10.i to i8
+  store i8 %tmp11.i, i8* %tmp63, align 1 ; field 9 = (i8)((value >> 11) - 1)
+  %tmp12.i = bitcast %float4* %tmp58 to %float3*
+  %tmp13.i = call %float3 @zot() #1
+  %tmp15.i = extractvalue %float3 %tmp13.i, 0
+  %tmp18.i = getelementptr inbounds %float4, %float4* %tmp58, i32 undef, i32 0
+  %tmp19.i = load float, float* %tmp18.i, align 4
+  %tmp22.i = fsub contract float %tmp19.i, %tmp15.i
+  %tmp23.i = getelementptr inbounds %float3, %float3* %tmp12.i, i32 undef, i32 2
+  %tmp24.i = load float, float* %tmp23.i, align 4
+  %tmp17.i = extractvalue %float3 %tmp13.i, 2
+  %tmp27.i = fsub contract float %tmp24.i, %tmp17.i
+  %tmp28.i = call %float3 @bar.2(float %tmp22.i, float %tmp27.i) #1
+  store %float3 %tmp28.i, %float3* %tmp59, align 4 ; field 5 = @bar.2(field2[0] - zot[0], field2[2] - zot[2])
+  %tmp37.i = load i8, i8* %tmp63, align 1 ; reload the field 9 value stored above
+  %tmp38.i = zext i8 %tmp37.i to i64
+  %tmp39.i = getelementptr inbounds [27 x %char3], [27 x %char3]* addrspacecast ([27 x %char3] addrspace(4)* @global to [27 x %char3]*), i64 0, i64 %tmp38.i
+  %tmp40.i = getelementptr inbounds %int3, %int3* %tmp62, i32 undef, i32 0
+  %tmp41.i = load i32, i32* %tmp40.i, align 4
+  %tmp42.i = getelementptr inbounds %char3, %char3* %tmp39.i, i32 undef, i32 0
+  %tmp43.i = load i8, i8* %tmp42.i, align 1
+  %tmp44.i = sext i8 %tmp43.i to i32
+  %tmp45.i = add nsw i32 %tmp41.i, %tmp44.i
+  %tmp48.i = getelementptr inbounds %int3, %int3* %tmp62, i32 undef, i32 1
+  %tmp49.i = load i32, i32* %tmp48.i, align 4
+  %tmp50.i = getelementptr inbounds %char3, %char3* %tmp39.i, i32 undef, i32 1
+  %tmp51.i = load i8, i8* %tmp50.i, align 1
+  %tmp52.i = sext i8 %tmp51.i to i32
+  %tmp53.i = add nsw i32 %tmp49.i, %tmp52.i
+  %tmp54.i = getelementptr inbounds %int3, %int3* %tmp62, i32 undef, i32 2
+  %tmp55.i = load i32, i32* %tmp54.i, align 4
+  %tmp56.i = getelementptr inbounds %char3, %char3* %tmp39.i, i32 undef, i32 2
+  %tmp57.i = load i8, i8* %tmp56.i, align 1
+  %tmp58.i = sext i8 %tmp57.i to i32
+  %tmp59.i = add nsw i32 %tmp55.i, %tmp58.i
+  %tmp60.i = call %int3 @hoge(i32 %tmp45.i, i32 %tmp53.i, i32 %tmp59.i) #1 ; field 3 plus the sign-extended @global entry, per component; result unused
+  %tmp61.i = getelementptr inbounds i32, i32* %tmp61, i64 undef ; undef index into the field 0 pointer
+  %tmp62.i = load i32, i32* %tmp61.i, align 4
+  store i32 %tmp62.i, i32* %tmp64, align 4 ; field 8 = loaded i32
+  br label %bb67
+
+bb67:                                             ; preds = %bb49, %bb
+  %tmp68 = phi i1 [ true, %bb49 ], [ false, %bb ]
+  ret i1 %tmp68
+}
+
+declare i64 @foo() ; external; @pluto adds the result to %struct.blam field 7
+
+define void @barney(%struct.spam.2* %arg) { ; listed as a kernel in !nvvm.annotations; the function the test header inspects
+bb:
+  call void asm sideeffect "// KEEP", ""() ; side-effecting inline asm whose text is only a `//` comment
+  %tmp = alloca %struct.zot, align 16 ; local object handed to @pluto on every loop iteration
+  %tmp1 = alloca i32, i32 undef, align 4 ; element count is undef
+  %tmp2 = alloca %float4, i32 undef, align 16 ; element count is undef
+  br label %bb3
+
+bb3:                                              ; preds = %bb
+  %tmp4 = getelementptr inbounds %struct.spam.2, %struct.spam.2* %arg, i32 undef, i32 1
+  %tmp5 = load i16*, i16** %tmp4, align 8
+  %tmp6 = bitcast %struct.zot* %tmp to i8*
+  %tmp7 = getelementptr inbounds i8, i8* %tmp6, i64 16 ; bytes 16.. of the local are viewed as %struct.blam
+  %tmp8 = bitcast i8* %tmp7 to %struct.blam*
+  %tmp9 = getelementptr inbounds %struct.blam, %struct.blam* %tmp8, i32 undef, i32 1
+  store i16* %tmp5, i16** %tmp9, align 8 ; field 1 = the i16* loaded from %arg
+  %tmp10 = bitcast %struct.zot* %tmp to i32 (...)***
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @global.1, i32 0, inrange i32 0, i32 3) to i32 (...)**), i32 (...)*** %tmp10, align 8 ; first word of the local = address just past @global.1's array
+  br label %bb11
+
+bb11:                                             ; preds = %bb49, %bb3
+  %tmp12 = bitcast %struct.zot* %tmp to %struct.wwwww*
+  %tmp13 = call i1 @pluto(%struct.wwwww* %tmp12)
+  br i1 %tmp13, label %bb15, label %bb14 ; loop continues while @pluto returns true
+
+bb14:                                             ; preds = %bb11
+  ret void
+
+bb15:                                             ; preds = %bb11
+  %tmp16 = bitcast %struct.zot* %tmp to i8**
+  %tmp17 = load i8*, i8** %tmp16, align 16 ; pointer held in the first word of the local
+  %tmp18 = getelementptr i8, i8* %tmp17, i64 -24
+  %tmp19 = bitcast i8* %tmp18 to i64*
+  %tmp20 = load i64, i64* %tmp19, align 8 ; i64 located 24 bytes before that pointer; used as a byte offset below
+  %tmp21 = bitcast %struct.zot* %tmp to i8*
+  %tmp22 = getelementptr inbounds i8, i8* %tmp21, i64 %tmp20
+  %tmp23 = bitcast i8* %tmp22 to %struct.blam*
+  %tmp24 = getelementptr inbounds %struct.blam, %struct.blam* %tmp23, i32 undef, i32 11
+  %tmp25 = load i32, i32* %tmp24, align 4 ; field 11 (i32)
+  store i32 %tmp25, i32* %tmp1, align 4
+  %tmp26 = bitcast %struct.zot* %tmp to i8**
+  %tmp27 = load i8*, i8** %tmp26, align 16 ; same pointer/offset/view sequence as above, recomputed
+  %tmp28 = getelementptr i8, i8* %tmp27, i64 -24
+  %tmp29 = bitcast i8* %tmp28 to i64*
+  %tmp30 = load i64, i64* %tmp29, align 8
+  %tmp31 = bitcast %struct.zot* %tmp to i8*
+  %tmp32 = getelementptr inbounds i8, i8* %tmp31, i64 %tmp30
+  %tmp33 = bitcast i8* %tmp32 to %struct.blam*
+  %tmp34 = bitcast %struct.spam.2* %arg to %float4**
+  %tmp35 = load i32, i32* %tmp1, align 4 ; reload of the field 11 value stored just above
+  %tmp36 = load %float4*, %float4** %tmp34, align 8
+  %tmp37 = zext i32 %tmp35 to i64
+  %tmp38 = getelementptr inbounds %float4, %float4* %tmp36, i64 %tmp37
+  %tmp39 = bitcast %float4* %tmp38 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* %tmp39, i64 undef, i1 false) ; destination and length are both undef
+  %tmp40 = getelementptr inbounds %struct.blam, %struct.blam* %tmp33, i32 undef, i32 5
+  %tmp41 = getelementptr inbounds %float3, %float3* %tmp40, i32 undef, i32 2
+  %tmp42 = load float, float* %tmp41, align 4 ; element 2 of field 5, which @pluto stores to
+  %tmp43 = getelementptr inbounds %float4, %float4* %tmp2, i32 undef, i32 2
+  %tmp44 = load float, float* %tmp43, align 8 ; reads %tmp2, which nothing in this function stores to
+  %tmp45 = fsub contract float %tmp42, %tmp44
+  %tmp46 = call %float4 @snork(float %tmp45) ; result unused
+  br label %bb49
+
+
+bb49:                                             ; preds = %bb15
+  br label %bb11
+
+bb50:                                             ; No predecessors!
+  unreachable
+}
+
+attributes #0 = { argmemonly nofree nounwind willreturn } ; attached to the llvm.memcpy declaration
+attributes #1 = { nounwind } ; attached to the @zot, @bar.2 and @hoge call sites in @pluto
+
+!nvvm.annotations = !{!0}
+
+!0 = !{void (%struct.spam.2*)* @barney, !"kernel", i32 1} ; marks @barney as a kernel
+!1 = !{!2, !2, i64 0} ; TBAA access tag on the first two pointer loads in @pluto
+!2 = !{!"vtable pointer", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!5, !13, i64 64} ; TBAA access tag on the i64 store in @pluto: a "long" at offset 64 of !5
+!5 = !{!"_ZTSN7cuneibs22neiblist_iterator_coreE", !6, i64 0, !6, i64 8, !8, i64 16, !10, i64 32, !11, i64 44, !12, i64 48, !13, i64 64, !11, i64 72, !7, i64 76, !11, i64 80}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !3, i64 0}
+!8 = !{!"_ZTS6float4", !9, i64 0, !9, i64 4, !9, i64 8, !9, i64 12}
+!9 = !{!"float", !7, i64 0}
+!10 = !{!"_ZTS4int3", !11, i64 0, !11, i64 4, !11, i64 8}
+!11 = !{!"int", !7, i64 0}
+!12 = !{!"_ZTS6float3", !9, i64 0, !9, i64 4, !9, i64 8}
+!13 = !{!"long", !7, i64 0}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -328,6 +328,7 @@
     addEarlyCSEOrGVNPass();
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
+    addPass(createSROAPass());
   }
 }