Index: lib/IR/Instructions.cpp
===================================================================
--- lib/IR/Instructions.cpp
+++ lib/IR/Instructions.cpp
@@ -2331,18 +2331,12 @@
       // Allowed, use first cast's opcode
       return firstOp;
     case 14:
-      // FIXME: this state can be merged with (2), but the following assert
-      // is useful to check the correcteness of the sequence due to semantic
-      // change of bitcast.
-      assert(
-        SrcTy->isPtrOrPtrVectorTy() &&
-        MidTy->isPtrOrPtrVectorTy() &&
-        DstTy->isPtrOrPtrVectorTy() &&
-        SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
-        MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
-        "Illegal bitcast, addrspacecast sequence!");
-      // Allowed, use second cast's opcode
-      return secondOp;
+      // bitcast, addrspacecast -> addrspacecast if the element type of
+      // bitcast's source is the same as that of addrspacecast's destination.
+      if (SrcTy->getPointerElementType() == DstTy->getPointerElementType())
+        return Instruction::AddrSpaceCast;
+      return 0;
+
     case 15:
       // FIXME: this state can be merged with (1), but the following assert
       // is useful to check the correcteness of the sequence due to semantic
Index: lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1434,7 +1434,12 @@
   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
     // If casting the result of a getelementptr instruction with no offset, turn
     // this into a cast of the original pointer!
-    if (GEP->hasAllZeroIndices()) {
+    if (GEP->hasAllZeroIndices() &&
+        // If CI is an addrspacecast and GEP changes the poiner type, merging
+        // GEP into CI would undo canonicalizing addrspacecast with different
+        // pointer types, causing infinite loops.
+        (!isa<AddrSpaceCastInst>(CI) ||
+          GEP->getType() == GEP->getPointerOperand()->getType())) {
       // Changing the cast operand is usually not a good idea but it is safe
       // here because the pointer operand is being replaced with another
       // pointer operand so the opcode doesn't need to change.
@@ -1904,5 +1909,24 @@
 }
 
 Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+  // If the destination pointer element type is not the the same as the source's
+  // do the addrspacecast to the same type, and then the bitcast in the new
+  // address space. This allows the cast to be exposed to other transforms.
+  Value *Src = CI.getOperand(0);
+  PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
+  PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
+
+  Type *DestElemTy = DestTy->getElementType();
+  if (SrcTy->getElementType() != DestElemTy) {
+    Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
+    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+      MidTy = VectorType::get(MidTy, CI.getType()->getVectorNumElements());
+
+    // Putting bitcast before addrspacecast helps NVPTXFavorNonGenericAddrSpaces
+    // to fold the addrspacecast into load/store.
+    Value *NewBitCast = Builder->CreateBitCast(Src, MidTy);
+    return new AddrSpaceCastInst(NewBitCast, CI.getType());
+  }
+
   return commonPointerCastTransforms(CI);
 }
Index: test/Transforms/InstCombine/addrspacecast.ll
===================================================================
--- test/Transforms/InstCombine/addrspacecast.ll
+++ test/Transforms/InstCombine/addrspacecast.ll
@@ -28,13 +28,91 @@
 
 define float* @combine_redundant_addrspacecast_types(i32 addrspace(1)* %x) nounwind {
 ; CHECK-LABEL: @combine_redundant_addrspacecast_types(
-; CHECK: addrspacecast i32 addrspace(1)* %x to float*
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float*
 ; CHECK-NEXT: ret
   %y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(3)*
   %z = addrspacecast i32 addrspace(3)* %y to float*
   ret float* %z
 }
 
+define <4 x float*> @combine_redundant_addrspacecast_types_vector(<4 x i32 addrspace(1)*> %x) nounwind {
+; CHECK-LABEL: @combine_redundant_addrspacecast_types_vector(
+; CHECK-NEXT: bitcast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(1)*>
+; CHECK-NEXT: addrspacecast <4 x float addrspace(1)*> %1 to <4 x float*>
+; CHECK-NEXT: ret
+  %y = addrspacecast <4 x i32 addrspace(1)*> %x to <4 x i32 addrspace(3)*>
+  %z = addrspacecast <4 x i32 addrspace(3)*> %y to <4 x float*>
+  ret <4 x float*> %z
+}
+
+define float addrspace(2)* @combine_addrspacecast_bitcast_1(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_bitcast_1(
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+  %y = addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+  %z = bitcast i32 addrspace(2)* %y to float addrspace(2)*
+  ret float addrspace(2)* %z
+}
+
+define i32 addrspace(2)* @combine_addrspacecast_bitcast_2(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_bitcast_2(
+; CHECK: addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+; CHECK-NEXT: ret
+  %y = addrspacecast i32 addrspace(1)* %x to float addrspace(2)*
+  %z = bitcast float addrspace(2)* %y to i32 addrspace(2)*
+  ret i32 addrspace(2)* %z
+}
+
+define i32 addrspace(2)* @combine_bitcast_addrspacecast_1(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_bitcast_addrspacecast_1(
+; CHECK: addrspacecast i32 addrspace(1)* %x to i32 addrspace(2)*
+; CHECK-NEXT: ret
+  %y = bitcast i32 addrspace(1)* %x to i8 addrspace(1)*
+  %z = addrspacecast i8 addrspace(1)* %y to i32 addrspace(2)*
+  ret i32 addrspace(2)* %z
+}
+
+define float addrspace(2)* @combine_bitcast_addrspacecast_2(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_bitcast_addrspacecast_2(
+; CHECK: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+  %y = bitcast i32 addrspace(1)* %x to i8 addrspace(1)*
+  %z = addrspacecast i8 addrspace(1)* %y to float addrspace(2)*
+  ret float addrspace(2)* %z
+}
+
+define float addrspace(2)* @combine_addrspacecast_types(i32 addrspace(1)* %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_types(
+; CHECK-NEXT: bitcast i32 addrspace(1)* %x to float addrspace(1)*
+; CHECK-NEXT: addrspacecast float addrspace(1)* %1 to float addrspace(2)*
+; CHECK-NEXT: ret
+  %y = addrspacecast i32 addrspace(1)* %x to float addrspace(2)*
+  ret float addrspace(2)* %y
+}
+
+define <4 x float addrspace(2)*> @combine_addrspacecast_types_vector(<4 x i32 addrspace(1)*> %x) nounwind {
+; CHECK-LABEL: @combine_addrspacecast_types_vector(
+; CHECK-NEXT: bitcast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(1)*>
+; CHECK-NEXT: addrspacecast <4 x float addrspace(1)*> %1 to <4 x float addrspace(2)*>
+; CHECK-NEXT: ret
+  %y = addrspacecast <4 x i32 addrspace(1)*> %x to <4 x float addrspace(2)*>
+  ret <4 x float addrspace(2)*> %y
+}
+
+define i32 @canonicalize_addrspacecast([16 x i32] addrspace(1)* %arr) {
+; CHECK-LABEL: @canonicalize_addrspacecast(
+; CHECK-NEXT: getelementptr inbounds [16 x i32] addrspace(1)* %arr, i32 0, i32 0
+; CHECK-NEXT: addrspacecast i32 addrspace(1)* %{{[a-zA-Z0-9]+}} to i32*
+; CHECK-NEXT: load i32*
+; CHECK-NEXT: ret i32
+  %p = addrspacecast [16 x i32] addrspace(1)* %arr to i32*
+  %v = load i32* %p
+  ret i32 %v
+}
+
 @const_array = addrspace(2) constant [60 x i8] [i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
                                                 i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
                                                 i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22, i8 2, i8 9, i8 4, i8 22,
Index: test/Transforms/InstCombine/getelementptr.ll
===================================================================
--- test/Transforms/InstCombine/getelementptr.ll
+++ test/Transforms/InstCombine/getelementptr.ll
@@ -732,7 +732,8 @@
 define i64 @test_gep_bitcast_array_same_size_element_addrspacecast([100 x double]* %arr, i64 %N) {
 ; CHECK-LABEL: @test_gep_bitcast_array_same_size_element_addrspacecast(
 ; CHECK: getelementptr [100 x double]* %arr, i64 0, i64 %V
-; CHECK-NEXT: %t = addrspacecast double*
+; CHECK-NEXT: bitcast double*
+; CHECK-NEXT: %t = addrspacecast i64*
 ; CHECK: load i64 addrspace(3)* %t
   %cast = addrspacecast [100 x double]* %arr to i64 addrspace(3)*
   %V = mul i64 %N, 8
@@ -802,10 +803,22 @@
 ; CHECK-NEXT: ret i16 8
 }
 
-define i32 addrspace(1)* @ascast_0_gep([128 x i32]* %p) nounwind {
+define i32 addrspace(1)* @ascast_0_gep(i32* %p) nounwind {
 ; CHECK-LABEL: @ascast_0_gep(
 ; CHECK-NOT: getelementptr
 ; CHECK: ret
+  %gep = getelementptr i32* %p, i32 0
+  %x = addrspacecast i32* %gep to i32 addrspace(1)*
+  ret i32 addrspace(1)* %x
+}
+
+; Do not merge the GEP and the addrspacecast, because it would undo the
+; addrspacecast canonicalization.
+define i32 addrspace(1)* @ascast_0_0_gep([128 x i32]* %p) nounwind {
+; CHECK-LABEL: @ascast_0_0_gep(
+; CHECK-NEXT: getelementptr [128 x i32]
+; CHECK-NEXT: addrspacecast i32*
+; CHECK-NEXT: ret i32 addrspace(1)*
   %gep = getelementptr [128 x i32]* %p, i32 0, i32 0
   %x = addrspacecast i32* %gep to i32 addrspace(1)*
   ret i32 addrspace(1)* %x
Index: test/Transforms/InstCombine/memcpy-from-global.ll
===================================================================
--- test/Transforms/InstCombine/memcpy-from-global.ll
+++ test/Transforms/InstCombine/memcpy-from-global.ll
@@ -78,7 +78,8 @@
 ; %A alloca is deleted
 ; This doesn't exactly match what test2 does, because folding the type
 ; cast into the alloca doesn't work for the addrspacecast yet.
-; CHECK-NEXT: alloca %T
+; CHECK-NEXT: alloca [124 x i8]
+; CHECK-NEXT: getelementptr
 ; CHECK-NEXT: addrspacecast
 
 ; use @G instead of %A