diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -83,12 +83,8 @@
   const DataLayout *DL = nullptr;
   MemoryDependenceResults *MDA = nullptr;
 
-  bool checkArgumentUses(Value &Arg) const;
-  bool isOutArgumentCandidate(Argument &Arg) const;
-
-#ifndef NDEBUG
-  bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const;
-#endif
+  Type *getStoredType(Value &Arg) const;
+  Type *getOutArgumentType(Argument &Arg) const;
 
 public:
   static char ID;
@@ -114,72 +110,61 @@
 
 char AMDGPURewriteOutArguments::ID = 0;
 
-bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const {
+Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const {
   const int MaxUses = 10;
   int UseCount = 0;
 
-  for (Use &U : Arg.uses()) {
-    StoreInst *SI = dyn_cast<StoreInst>(U.getUser());
-    if (UseCount > MaxUses)
-      return false;
+  SmallVector<Use *> Worklist; // Uses still to visit, through bitcasts.
+  for (Use &U : Arg.uses())
+    Worklist.push_back(&U);
 
-    if (!SI) {
-      auto *BCI = dyn_cast<BitCastInst>(U.getUser());
-      if (!BCI || !BCI->hasOneUse())
-        return false;
-
-      // We don't handle multiple stores currently, so stores to aggregate
-      // pointers aren't worth the trouble since they are canonically split up.
-      Type *DestEltTy = BCI->getType()->getPointerElementType();
-      if (DestEltTy->isAggregateType())
-        return false;
-
-      // We could handle these if we had a convenient way to bitcast between
-      // them.
-      Type *SrcEltTy = Arg.getType()->getPointerElementType();
-      if (SrcEltTy->isArrayTy())
-        return false;
-
-      // Special case handle structs with single members. It is useful to handle
-      // some casts between structs and non-structs, but we can't bitcast
-      // directly between them. Blender uses some casts that look like
-      // { <3 x float> }* to <4 x float>*
-      if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1)))
-        return false;
-
-      // Clang emits OpenCL 3-vector type accesses with a bitcast to the
-      // equivalent 4-element vector and accesses that, and we're looking for
-      // this pointer cast.
-      if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy))
-        return false;
-
-      return checkArgumentUses(*BCI);
+  Type *StoredType = nullptr; // Common type of all stores seen so far.
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+
+    if (auto *BCI = dyn_cast<BitCastInst>(U->getUser())) {
+      for (Use &BCIUse : BCI->uses())
+        Worklist.push_back(&BCIUse);
+      continue; // Pointer casts are looked through; inspect their users.
     }
 
-    if (!SI->isSimple() ||
-        U.getOperandNo() != StoreInst::getPointerOperandIndex())
-      return false;
+    if (auto *SI = dyn_cast<StoreInst>(U->getUser())) {
+      if (UseCount++ > MaxUses) // Only stores count toward the limit.
+        return nullptr;
+
+      if (!SI->isSimple() ||
+          U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return nullptr; // Non-simple store, or pointer is the stored value.
 
-    ++UseCount;
+      if (StoredType && StoredType != SI->getValueOperand()->getType())
+        return nullptr; // More than one type.
+      StoredType = SI->getValueOperand()->getType();
+      continue;
+    }
+
+    // Unsupported user.
+    return nullptr;
   }
 
-  // Skip unused arguments.
-  return UseCount > 0;
+  return StoredType; // Still null if no store was found.
 }
 
-bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const {
+Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const {
   const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs;
   PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType());
 
   // TODO: It might be useful for any out arguments, not just privates.
   if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() &&
                  !AnyAddressSpace) ||
-      Arg.hasByValAttr() || Arg.hasStructRetAttr() ||
-      DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) {
-    return false;
+      Arg.hasByValAttr() || Arg.hasStructRetAttr()) {
+    return nullptr; // Non-pointer, wrong address space, byval, or sret.
   }
 
-  return checkArgumentUses(Arg);
+  Type *StoredType = getStoredType(Arg); // Null if stores are unsuitable.
+  if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes)
+    return nullptr; // No usable stored type, or it exceeds the size limit.
+
+  return StoredType; // Non-null: Arg is an out-argument candidate.
 }
 
 bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
@@ -187,22 +172,6 @@
   return false;
 }
 
-#ifndef NDEBUG
-bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
-  auto *VT0 = dyn_cast<FixedVectorType>(Ty0);
-  auto *VT1 = dyn_cast<FixedVectorType>(Ty1);
-  if (!VT0 || !VT1)
-    return false;
-
-  if (VT0->getNumElements() != 3 ||
-      VT1->getNumElements() != 4)
-    return false;
-
-  return DL->getTypeSizeInBits(VT0->getElementType()) ==
-         DL->getTypeSizeInBits(VT1->getElementType());
-}
-#endif
-
 bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -215,7 +184,7 @@
   MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
 
   unsigned ReturnNumRegs = 0;
-  SmallSet<int, 4> OutArgIndexes;
+  SmallDenseMap<int, Type *, 4> OutArgIndexes;
   SmallVector<Type *, 4> ReturnTypes;
   Type *RetTy = F.getReturnType();
   if (!RetTy->isVoidTy()) {
@@ -227,12 +196,12 @@
     ReturnTypes.push_back(RetTy);
   }
 
-  SmallVector<Argument *, 4> OutArgs;
+  SmallVector<std::pair<Argument *, Type *>, 4> OutArgs;
   for (Argument &Arg : F.args()) {
-    if (isOutArgumentCandidate(Arg)) {
+    if (Type *Ty = getOutArgumentType(Arg)) {
       LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg
                         << " in function " << F.getName() << '\n');
-      OutArgs.push_back(&Arg);
+      OutArgs.push_back({&Arg, Ty});
     }
   }
 
@@ -264,11 +233,12 @@
     // first. On the second iteration we've removed that out clobbering argument
     // (by effectively moving it into another function) and will find the second
     // argument is OK to move.
-    for (Argument *OutArg : OutArgs) {
+    for (const auto &Pair : OutArgs) {
       bool ThisReplaceable = true;
       SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores;
 
-      Type *ArgTy = OutArg->getType()->getPointerElementType();
+      Argument *OutArg = Pair.first;
+      Type *ArgTy = Pair.second;
 
       // Skip this argument if converting it will push us over the register
       // count to return limit.
@@ -324,7 +294,7 @@
 
       if (ThisReplaceable) {
         ReturnTypes.push_back(ArgTy);
-        OutArgIndexes.insert(OutArg->getArgNo());
+        OutArgIndexes.insert({OutArg->getArgNo(), ArgTy});
         ++NumOutArgumentsReplaced;
         Changing = true;
       }
@@ -376,32 +346,8 @@
     if (RetVal)
       NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++);
 
-    for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) {
-      Argument *Arg = ReturnPoint.first;
-      Value *Val = ReturnPoint.second;
-      Type *EltTy = Arg->getType()->getPointerElementType();
-      if (Val->getType() != EltTy) {
-        Type *EffectiveEltTy = EltTy;
-        if (StructType *CT = dyn_cast<StructType>(EltTy)) {
-          assert(CT->getNumElements() == 1);
-          EffectiveEltTy = CT->getElementType(0);
-        }
-
-        if (DL->getTypeSizeInBits(EffectiveEltTy) !=
-            DL->getTypeSizeInBits(Val->getType())) {
-          assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
-          Val = B.CreateShuffleVector(Val, ArrayRef<int>{0, 1, 2});
-        }
-
-        Val = B.CreateBitCast(Val, EffectiveEltTy);
-
-        // Re-create single element composite.
-        if (EltTy != EffectiveEltTy)
-          Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0);
-      }
-
-      NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++);
-    }
+    for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second)
+      NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++);
 
     if (RetVal)
       RI->setOperand(0, NewRetVal);
@@ -433,7 +379,7 @@
 
     PointerType *ArgType = cast<PointerType>(Arg.getType());
 
-    auto *EltTy = ArgType->getPointerElementType();
+    Type *EltTy = OutArgIndexes[Arg.getArgNo()];
     const auto Align =
         DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
 
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-any-address-space-out-arguments -amdgpu-rewrite-out-arguments < %s | FileCheck %s
 
 ; CHECK: %void_one_out_non_private_arg_i32_1_use = type { i32 }
-; CHECK: %bitcast_pointer_as1 = type { <3 x i32> }
+; CHECK: %bitcast_pointer_as1 = type { <4 x i32> }
 
 ; CHECK-LABEL: define private %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(i32 addrspace(1)* %val) #0 {
 ; CHECK-NEXT: ret %void_one_out_non_private_arg_i32_1_use zeroinitializer
@@ -19,9 +19,8 @@
 ; CHECK-LABEL: define private %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* %out) #0 {
 ; CHECK-NEXT: %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef
 ; CHECK-NEXT: %bitcast = bitcast <3 x i32> addrspace(1)* %out to <4 x i32> addrspace(1)*
-; CHECK-NEXT: %1 = shufflevector <4 x i32> %load, <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT: %2 = insertvalue %bitcast_pointer_as1 undef, <3 x i32> %1, 0
-; CHECK-NEXT: ret %bitcast_pointer_as1 %2
+; CHECK-NEXT: %1 = insertvalue %bitcast_pointer_as1 undef, <4 x i32> %load, 0
+; CHECK-NEXT: ret %bitcast_pointer_as1 %1
 
 ; CHECK-LABEL: define void @bitcast_pointer_as1(<3 x i32> addrspace(1)* %0) #1 {
 ; CHECK-NEXT: %2 = call %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* undef)
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
@@ -814,16 +814,16 @@
 ; CHECK-SAME: (void ()** [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FUNC:%.*]] = load i32 ()*, i32 ()** undef, align 8
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast void ()** [[OUT]] to i32 ()**
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 ()* [[FUNC]] to void ()*
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, void ()* [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, i32 ()* [[FUNC]], 0
+; CHECK-NEXT:    ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type
 ; CHECK-SAME: (void ()** [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(void ()** undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0
-; CHECK-NEXT:    store void ()* [[TMP3]], void ()** [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast void ()** [[TMP0]] to i32 ()**
+; CHECK-NEXT:    store i32 ()* [[TMP3]], i32 ()** [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -925,16 +925,16 @@
 ; CHECK-SAME: (<3 x i32>* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x i32>* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <3 x i32> [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <4 x i32> [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32
 ; CHECK-SAME: (<3 x i32>* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(<3 x i32>* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0
-; CHECK-NEXT:    store <3 x i32> [[TMP3]], <3 x i32>* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x i32>* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -942,17 +942,16 @@
 ; CHECK-SAME: (<3 x float>* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast <3 x float>* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <4 x i32> [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32
 ; CHECK-SAME: (<3 x float>* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(<3 x float>* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0
-; CHECK-NEXT:    store <3 x float> [[TMP3]], <3 x float>* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <3 x float>* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -960,32 +959,50 @@
 ; CHECK-SAME: (float* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast float* [[OUT]] to i32*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[LOAD]] to float
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, float [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, i32 [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32
 ; CHECK-SAME: (float* [[TMP0:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(float* undef)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0
-; CHECK-NEXT:    store float [[TMP3]], float* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP0]] to i32*
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body
 ; CHECK-SAME: (half* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast half* [[OUT]] to i32*
-; CHECK-NEXT:    store i32 [[LOAD]], i32* [[BITCAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] undef, i32 [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_I32_F16]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16
+; CHECK-SAME: (half* [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(half* undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast half* [[TMP0]] to i32*
+; CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body
 ; CHECK-SAME: (i32* [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[LOAD:%.*]] = load volatile half, half addrspace(1)* undef, align 2
 ; CHECK-NEXT:    [[BITCAST:%.*]] = bitcast i32* [[OUT]] to half*
-; CHECK-NEXT:    store half [[LOAD]], half* [[BITCAST]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] undef, half [[LOAD]], 0
+; CHECK-NEXT:    ret [[BITCAST_POINTER_F16_I32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32
+; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(i32* undef)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to half*
+; CHECK-NEXT:    store half [[TMP3]], half* [[TMP4]], align 2
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -993,17 +1010,16 @@
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(%struct.v3f32* undef, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1011,52 +1027,48 @@
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[EXTRACTVEC]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, <4 x i32> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(%struct.v3f32* undef, <3 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body
 ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32
 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(%struct.v4f32* undef, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VALUE]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, <4 x i32> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32
 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(%struct.v3f32* undef, <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1064,62 +1076,97 @@
 ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32
 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(%struct.v4f32* undef, <3 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body
 ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <2 x float>*
-; CHECK-NEXT:    store <2 x float> [[VALUE]], <2 x float>* [[CAST]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] undef, <2 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32
+; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(%struct.v3f32* undef, <2 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <2 x float>*
+; CHECK-NEXT:    store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 8
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body
 ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[EXTRACTVEC]], <4 x float>* [[CAST]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32
+; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(%struct.v3f32.f32* undef, <3 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body
 ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    store <4 x float> [[VALUE]], <4 x float>* [[CAST]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32
+; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(%struct.v3f32.f32* undef, <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body
 ; CHECK-SAME: (%struct.i128* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast %struct.i128* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[VALUE]] to i128
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_I128:%.*]] undef, i128 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, [[STRUCT_I128]] [[TMP2]], 0
-; CHECK-NEXT:    ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32
 ; CHECK-SAME: (%struct.i128* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(%struct.i128* undef, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_I128:%.*]] [[TMP4]], %struct.i128* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.i128* [[TMP0]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body
 ; CHECK-SAME: ([4 x i32]* [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast [4 x i32]* [[OUT]] to [4 x float]*
-; CHECK-NEXT:    store [4 x float] [[VALUE]], [4 x float]* [[CAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] undef, [4 x float] [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32
+; CHECK-SAME: ([4 x i32]* [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body([4 x i32]* undef, [4 x float] [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast [4 x i32]* [[TMP0]] to [4 x float]*
+; CHECK-NEXT:    store [4 x float] [[TMP4]], [4 x float]* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 ;
@@ -1130,30 +1177,36 @@
 ; CHECK:       ret0:
 ; CHECK-NEXT:    [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
 ; CHECK-NEXT:    [[CAST0:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP1]], 0
-; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0
+; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP0]]
 ; CHECK:       ret1:
 ; CHECK-NEXT:    [[CAST1:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>*
 ; CHECK-NEXT:    [[LOAD:%.*]] = load <4 x float>, <4 x float> addrspace(1)* undef, align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[LOAD]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[STRUCT_V3F32]] undef, <3 x float> [[TMP3]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, [[STRUCT_V3F32]] [[TMP4]], 0
-; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, <4 x float> [[LOAD]], 0
+; CHECK-NEXT:    ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]]
 ;
 ;
 ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32
 ; CHECK-SAME: (i1 [[TMP0:%.*]], %struct.v3f32* [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], %struct.v3f32* undef, <3 x float> [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32:%.*]] [[TMP5]], %struct.v3f32* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.v3f32* [[TMP1]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 16
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
+; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body
 ; CHECK-SAME: (<3 x float>* [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[CAST:%.*]] = bitcast <3 x float>* [[OUT]] to %struct.v3f32*
-; CHECK-NEXT:    store [[STRUCT_V3F32]] [[VALUE]], %struct.v3f32* [[CAST]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[VALUE]], 0
+; CHECK-NEXT:    ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]]
+;
+;
+; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
+; CHECK-SAME: (<3 x float>* [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(<3 x float>* undef, [[STRUCT_V3F32]] [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <3 x float>* [[TMP0]] to %struct.v3f32*
+; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], %struct.v3f32* [[TMP5]], align 16
 ; CHECK-NEXT:    ret void
 ;