Index: lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
+++ lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -8,35 +8,41 @@
 //===----------------------------------------------------------------------===//
 //
 // When a load/store accesses the generic address space, checks whether the
-// address is casted from a non-generic address space. If so, remove this
-// addrspacecast because accessing non-generic address spaces is typically
-// faster. Besides seeking addrspacecasts, this optimization also traces into
-// the base pointer of a GEP.
+// address is cast from a non-generic address space. If so, avoid casting
+// the address space because accessing non-generic address spaces is typically
+// faster. If the address of the load/store is a GEP, we also look into the
+// base of the GEP for redundant addrspacecasts.
 //
 // For instance, the code below loads a float from an array allocated in
 // addrspace(3).
 //
-//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-//   %1 = gep [10 x float]* %0, i64 0, i64 %i
+//   %0 = addrspacecast [10 x float] addrspace(3)* @a to [5 x float]*
+//   %1 = gep [5 x float]* %0, i64 0, i64 %i
 //   %2 = load float* %1 ; emits ld.f32
 //
-// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
-// and the GEP to expose more optimization opportunities to function
-// optimizeMemoryInst. The intermediate code looks like:
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast and
+// the GEP to expose more optimization opportunities to function
+// optimizeMemoryInstruction. Note that if the addrspacecast changes the
+// underlying element type, we need to leave a bitcast where the
+// addrspacecast was.
 //
-//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-//   %1 = addrspacecast float addrspace(3)* %0 to float*
-//   %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+// The intermediate code from the above example looks like:
+//
+//   %0 = bitcast [10 x float] addrspace(3)* @a to [5 x float] addrspace(3)*
+//   %1 = gep [5 x float] addrspace(3)* %0, i64 0, i64 %i
+//   %2 = addrspacecast float addrspace(3)* %1 to float*
+//   %3 = load float* %2 ; still emits ld.f32, but will be optimized shortly
 //
 // Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
 // generic pointers, and folds the load and the addrspacecast into a load from
 // the original address space. The final code looks like:
 //
-//   %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-//   %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//   %0 = bitcast [10 x float] addrspace(3)* @a to [5 x float] addrspace(3)*
+//   %1 = gep [5 x float] addrspace(3)* %0, i64 0, i64 %i
+//   %3 = load float addrspace(3)* %1 ; emits ld.shared.f32
 //
-// This pass may remove an addrspacecast in a different BB. Therefore, we
-// implement it as a FunctionPass.
+// The instructions that compute the address of a load/store may be in a
+// different basic block than the load/store itself. Therefore, we implement
+// this transformation as a FunctionPass.
 //
 //===----------------------------------------------------------------------===//

@@ -68,10 +74,10 @@
   /// Optimizes load/store instructions. Idx is the index of the pointer operand
   /// (0 for load, and 1 for store). Returns true if it changes anything.
   bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
-  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
-  /// indices)". This reordering exposes to optimizeMemoryInstruction more
-  /// optimization opportunities on loads and stores. Returns true if it changes
-  /// the program.
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep
+  /// (bitcast X), indices)". Reordering the gep and the addrspacecast exposes
+  /// more optimization opportunities to optimizeMemoryInstruction on loads
+  /// and stores. Returns true if it changes the program.
   bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
 };
 }

@@ -85,9 +91,16 @@
                 "Remove unnecessary non-generic-to-generic addrspacecasts",
                 false, false)

-// Decides whether removing Cast is valid and beneficial. Cast can be an
-// instruction or a constant expression.
-static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+// Decides whether eliminating the addrspacecast is beneficial. If the source
+// and destination of the addrspacecast have different element types, the
+// caller will later demote the addrspacecast to a bitcast instead of
+// completely removing it. In that case, this function outputs the type of
+// this intermediate bitcast as IntermediateType.
+//
+// The parameter Cast can be either an AddrSpaceCastInst or an addrspacecast
+// ConstantExpr.
+static bool IsEliminableAddrSpaceCast(Operator *Cast,
+                                      PointerType *&IntermediateType) {
   // Returns false if not even an addrspacecast.
   if (Cast->getOpcode() != Instruction::AddrSpaceCast)
     return false;

@@ -95,12 +108,15 @@
   Value *Src = Cast->getOperand(0);
   PointerType *SrcTy = cast<PointerType>(Src->getType());
   PointerType *DestTy = cast<PointerType>(Cast->getType());
-  // TODO: For now, we only handle the case where the addrspacecast only changes
-  // the address space but not the type. If the type also changes, we could
-  // still get rid of the addrspacecast by adding an extra bitcast, but we
-  // rarely see such scenarios.
-  if (SrcTy->getElementType() != DestTy->getElementType())
-    return false;
+
+  if (SrcTy->getElementType() != DestTy->getElementType()) {
+    // addrspacecast can change not only the address space but also the
+    // underlying element type of the source pointer. If it converts the
+    // element type, we will later demote the addrspacecast to a bitcast, and
+    // we compute the type of that intermediate bitcast here as preparation.
+    IntermediateType =
+        PointerType::get(DestTy->getElementType(), SrcTy->getAddressSpace());
+  }

   // Checks whether the addrspacecast is from a non-generic address space to the
   // generic address space.
@@ -108,34 +124,39 @@
           DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
 }

-bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
-    GEPOperator *GEP) {
+bool
+NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(GEPOperator *GEP) {
   Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
   if (!Cast)
     return false;

-  if (!IsEliminableAddrSpaceCast(Cast))
+  PointerType *BitCastedType = nullptr;
+  if (!IsEliminableAddrSpaceCast(Cast, BitCastedType))
     return false;

   SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
   if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    Value *Src = Cast->getOperand(0);
     // %1 = gep (addrspacecast X), indices
     // =>
-    // %0 = gep X, indices
+    // %0 = gep (bitcast X), indices
     // %1 = addrspacecast %0
-    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
-                                                           Indices,
-                                                           GEP->getName(),
-                                                           GEPI);
+    if (BitCastedType != nullptr) {
+      Src = new BitCastInst(Src, BitCastedType, "", GEPI);
+    }
+    GetElementPtrInst *NewGEPI =
+        GetElementPtrInst::Create(Src, Indices, GEP->getName(), GEPI);
     NewGEPI->setIsInBounds(GEP->isInBounds());
     GEP->replaceAllUsesWith(
         new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
   } else {
-    // GEP is a constant expression.
-    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
-        cast<Constant>(Cast->getOperand(0)),
-        Indices,
-        GEP->isInBounds());
+    // GEP is a ConstantExpr, so its operand Cast is also a ConstantExpr.
+    Constant *Src = cast<ConstantExpr>(Cast)->getOperand(0);
+    if (BitCastedType != nullptr) {
+      Src = ConstantExpr::getBitCast(Src, BitCastedType);
+    }
+    Constant *NewGEPCE =
+        ConstantExpr::getGetElementPtr(Src, Indices, GEP->isInBounds());
     GEP->replaceAllUsesWith(
         ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
   }

@@ -151,19 +172,25 @@
     hoistAddrSpaceCastFromGEP(GEP);
   }

-  // load/store (addrspacecast X) => load/store X if shortcutting the
-  // addrspacecast is valid and can improve performance.
+  // load/store (addrspacecast X) => load/store (bitcast X) if using the
+  // original address space is faster.
   //
   // e.g.,
-  //   %1 = addrspacecast float addrspace(3)* %0 to float*
-  //   %2 = load float* %1
+  //   %1 = addrspacecast float addrspace(3)* %0 to i32*
+  //   %2 = load i32* %1
   // ->
-  //   %2 = load float addrspace(3)* %0
+  //   %1 = bitcast float addrspace(3)* %0 to i32 addrspace(3)*
+  //   %2 = load i32 addrspace(3)* %1
   //
-  // Note: the addrspacecast can also be a constant expression.
+  // Note: the addrspacecast can also be a ConstantExpr.
   if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
-    if (IsEliminableAddrSpaceCast(Cast)) {
-      MI->setOperand(Idx, Cast->getOperand(0));
+    PointerType *BitCastedType = nullptr;
+    if (IsEliminableAddrSpaceCast(Cast, BitCastedType)) {
+      Value *Src = Cast->getOperand(0);
+      if (BitCastedType != nullptr) {
+        Src = new BitCastInst(Src, BitCastedType, "", MI);
+      }
+      MI->setOperand(Idx, Src);
       return true;
     }
   }
Index: test/CodeGen/NVPTX/access-non-generic.ll
===================================================================
--- test/CodeGen/NVPTX/access-non-generic.ll
+++ test/CodeGen/NVPTX/access-non-generic.ll
@@ -2,6 +2,8 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-favor-non-generic -dce | FileCheck %s --check-prefix IR

+%struct.float2 = type { float, float }
+
 @array = internal addrspace(3) global [10 x float] zeroinitializer, align 4
 @scalar = internal addrspace(3) global float 0.000000e+00, align 4

@@ -78,13 +80,36 @@
 ; different element types.
 define i32 @ld_int_from_float() {
 ; IR-LABEL: @ld_int_from_float
-; IR: addrspacecast
 ; PTX-LABEL: ld_int_from_float(
-; PTX: cvta.shared.u{{(32|64)}}
   %1 = load i32* addrspacecast(float addrspace(3)* @scalar to i32*), align 4
+; IR: [[ADDR:%[0-9]+]] = bitcast
+; IR-NEXT: load i32 addrspace(3)* [[ADDR]]
+; PTX: ld.shared.u32 {{%r[0-9]+}}, [scalar]
   ret i32 %1
 }

+define float @float_array_to_float2(i32 %i) {
+; IR-LABEL: @float_array_to_float2
+; PTX-LABEL: float_array_to_float2(
+; addrspacecast; gep; load => bitcast; gep; load
+  %p = addrspacecast [10 x float] addrspace(3)* @array to %struct.float2*
+  %p2 = getelementptr inbounds %struct.float2* %p, i32 0, i32 1
+  %1 = load float* %p2, align 4
+; IR: bitcast [10 x float] addrspace(3)* @array to %struct.float2 addrspace(3)*
+; IR-NEXT: getelementptr inbounds %struct.float2 addrspace(3)*
+; IR-NEXT: load float addrspace(3)*
+; PTX: ld.shared.f32 {{%f[0-9]+}}, [array+4]
+
+; load (gep (addrspacecast)) => load (gep (bitcast))
+  %2 = load float* getelementptr inbounds (%struct.float2* addrspacecast ([10 x float] addrspace(3)* @array to %struct.float2*), i32 0, i32 0), align 4
+; IR: bitcast [10 x float] addrspace(3)* @array to float addrspace(3)*
+; IR-NEXT: load float addrspace(3)*
+; PTX: ld.shared.f32 {{%f[0-9]+}}, [array]
+
+  %sum = fadd float %1, %2
+  ret float %sum
+}
+
 declare void @llvm.cuda.syncthreads() #3

 attributes #3 = { noduplicate nounwind }
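A note for context, not part of the patch: the diff never shows the driver
that feeds optimizeMemoryInstruction. In the existing pass, runOnFunction
walks every instruction and hands each load or store to
optimizeMemoryInstruction together with the index of its pointer operand
(0 for loads, 1 for stores, matching the Idx convention in the doc comment
above). A minimal sketch of that loop, assuming the LLVM 3.x-era FunctionPass
API this file is written against:

  bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
    bool Changed = false;
    for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
      for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
        if (isa<LoadInst>(I)) {
          // V = load P -- the pointer operand of a load is operand 0.
          Changed |= optimizeMemoryInstruction(I, 0);
        } else if (isa<StoreInst>(I)) {
          // store V, P -- the pointer operand of a store is operand 1.
          Changed |= optimizeMemoryInstruction(I, 1);
        }
      }
    }
    return Changed;
  }

The IR-level effect can be reproduced the same way the test's RUN line does,
e.g. "opt < test/CodeGen/NVPTX/access-non-generic.ll -S
-nvptx-favor-non-generic -dce".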