diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -83,12 +83,8 @@ const DataLayout *DL = nullptr; MemoryDependenceResults *MDA = nullptr; - bool checkArgumentUses(Value &Arg) const; - bool isOutArgumentCandidate(Argument &Arg) const; - -#ifndef NDEBUG - bool isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const; -#endif + Type *getStoredType(Value &Arg) const; + Type *getOutArgumentType(Argument &Arg) const; public: static char ID; @@ -114,72 +110,61 @@ char AMDGPURewriteOutArguments::ID = 0; -bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const { +Type *AMDGPURewriteOutArguments::getStoredType(Value &Arg) const { const int MaxUses = 10; int UseCount = 0; - for (Use &U : Arg.uses()) { - StoreInst *SI = dyn_cast(U.getUser()); - if (UseCount > MaxUses) - return false; + SmallVector Worklist; + for (Use &U : Arg.uses()) + Worklist.push_back(&U); - if (!SI) { - auto *BCI = dyn_cast(U.getUser()); - if (!BCI || !BCI->hasOneUse()) - return false; - - // We don't handle multiple stores currently, so stores to aggregate - // pointers aren't worth the trouble since they are canonically split up. - Type *DestEltTy = BCI->getType()->getPointerElementType(); - if (DestEltTy->isAggregateType()) - return false; - - // We could handle these if we had a convenient way to bitcast between - // them. - Type *SrcEltTy = Arg.getType()->getPointerElementType(); - if (SrcEltTy->isArrayTy()) - return false; - - // Special case handle structs with single members. It is useful to handle - // some casts between structs and non-structs, but we can't bitcast - // directly between them. Blender uses some casts that look like - // { <3 x float> }* to <4 x float>* - if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1))) - return false; - - // Clang emits OpenCL 3-vector type accesses with a bitcast to the - // equivalent 4-element vector and accesses that, and we're looking for - // this pointer cast. - if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy)) - return false; - - return checkArgumentUses(*BCI); + Type *StoredType = nullptr; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + + if (auto *BCI = dyn_cast(U->getUser())) { + for (Use &U : BCI->uses()) + Worklist.push_back(&U); + continue; } - if (!SI->isSimple() || - U.getOperandNo() != StoreInst::getPointerOperandIndex()) - return false; + if (auto *SI = dyn_cast(U->getUser())) { + if (UseCount++ > MaxUses) + return nullptr; + + if (!SI->isSimple() || + U->getOperandNo() != StoreInst::getPointerOperandIndex()) + return nullptr; - ++UseCount; + if (StoredType && StoredType != SI->getValueOperand()->getType()) + return nullptr; // More than one type. + StoredType = SI->getValueOperand()->getType(); + continue; + } + + // Unsupported user. + return nullptr; } - // Skip unused arguments. - return UseCount > 0; + return StoredType; } -bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const { +Type *AMDGPURewriteOutArguments::getOutArgumentType(Argument &Arg) const { const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs; PointerType *ArgTy = dyn_cast(Arg.getType()); // TODO: It might be useful for any out arguments, not just privates. if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() && !AnyAddressSpace) || - Arg.hasByValAttr() || Arg.hasStructRetAttr() || - DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) { - return false; + Arg.hasByValAttr() || Arg.hasStructRetAttr()) { + return nullptr; } - return checkArgumentUses(Arg); + Type *StoredType = getStoredType(Arg); + if (!StoredType || DL->getTypeStoreSize(StoredType) > MaxOutArgSizeBytes) + return nullptr; + + return StoredType; } bool AMDGPURewriteOutArguments::doInitialization(Module &M) { @@ -187,22 +172,6 @@ return false; } -#ifndef NDEBUG -bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const { - auto *VT0 = dyn_cast(Ty0); - auto *VT1 = dyn_cast(Ty1); - if (!VT0 || !VT1) - return false; - - if (VT0->getNumElements() != 3 || - VT1->getNumElements() != 4) - return false; - - return DL->getTypeSizeInBits(VT0->getElementType()) == - DL->getTypeSizeInBits(VT1->getElementType()); -} -#endif - bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -215,7 +184,7 @@ MDA = &getAnalysis().getMemDep(); unsigned ReturnNumRegs = 0; - SmallSet OutArgIndexes; + SmallDenseMap OutArgIndexes; SmallVector ReturnTypes; Type *RetTy = F.getReturnType(); if (!RetTy->isVoidTy()) { @@ -227,12 +196,12 @@ ReturnTypes.push_back(RetTy); } - SmallVector OutArgs; + SmallVector, 4> OutArgs; for (Argument &Arg : F.args()) { - if (isOutArgumentCandidate(Arg)) { + if (Type *Ty = getOutArgumentType(Arg)) { LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg << " in function " << F.getName() << '\n'); - OutArgs.push_back(&Arg); + OutArgs.push_back({&Arg, Ty}); } } @@ -264,11 +233,12 @@ // first. On the second iteration we've removed that out clobbering argument // (by effectively moving it into another function) and will find the second // argument is OK to move. - for (Argument *OutArg : OutArgs) { + for (const auto &Pair : OutArgs) { bool ThisReplaceable = true; SmallVector, 4> ReplaceableStores; - Type *ArgTy = OutArg->getType()->getPointerElementType(); + Argument *OutArg = Pair.first; + Type *ArgTy = Pair.second; // Skip this argument if converting it will push us over the register // count to return limit. @@ -324,7 +294,7 @@ if (ThisReplaceable) { ReturnTypes.push_back(ArgTy); - OutArgIndexes.insert(OutArg->getArgNo()); + OutArgIndexes.insert({OutArg->getArgNo(), ArgTy}); ++NumOutArgumentsReplaced; Changing = true; } @@ -376,32 +346,8 @@ if (RetVal) NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++); - for (std::pair ReturnPoint : Replacement.second) { - Argument *Arg = ReturnPoint.first; - Value *Val = ReturnPoint.second; - Type *EltTy = Arg->getType()->getPointerElementType(); - if (Val->getType() != EltTy) { - Type *EffectiveEltTy = EltTy; - if (StructType *CT = dyn_cast(EltTy)) { - assert(CT->getNumElements() == 1); - EffectiveEltTy = CT->getElementType(0); - } - - if (DL->getTypeSizeInBits(EffectiveEltTy) != - DL->getTypeSizeInBits(Val->getType())) { - assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); - Val = B.CreateShuffleVector(Val, ArrayRef{0, 1, 2}); - } - - Val = B.CreateBitCast(Val, EffectiveEltTy); - - // Re-create single element composite. - if (EltTy != EffectiveEltTy) - Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0); - } - - NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++); - } + for (std::pair ReturnPoint : Replacement.second) + NewRetVal = B.CreateInsertValue(NewRetVal, ReturnPoint.second, RetIdx++); if (RetVal) RI->setOperand(0, NewRetVal); @@ -433,7 +379,7 @@ PointerType *ArgType = cast(Arg.getType()); - auto *EltTy = ArgType->getPointerElementType(); + Type *EltTy = OutArgIndexes[Arg.getArgNo()]; const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll --- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments-address-space.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-any-address-space-out-arguments -amdgpu-rewrite-out-arguments < %s | FileCheck %s ; CHECK: %void_one_out_non_private_arg_i32_1_use = type { i32 } -; CHECK: %bitcast_pointer_as1 = type { <3 x i32> } +; CHECK: %bitcast_pointer_as1 = type { <4 x i32> } ; CHECK-LABEL: define private %void_one_out_non_private_arg_i32_1_use @void_one_out_non_private_arg_i32_1_use.body(i32 addrspace(1)* %val) #0 { ; CHECK-NEXT: ret %void_one_out_non_private_arg_i32_1_use zeroinitializer @@ -19,9 +19,8 @@ ; CHECK-LABEL: define private %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* %out) #0 { ; CHECK-NEXT: %load = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef ; CHECK-NEXT: %bitcast = bitcast <3 x i32> addrspace(1)* %out to <4 x i32> addrspace(1)* -; CHECK-NEXT: %1 = shufflevector <4 x i32> %load, <4 x i32> poison, <3 x i32> -; CHECK-NEXT: %2 = insertvalue %bitcast_pointer_as1 undef, <3 x i32> %1, 0 -; CHECK-NEXT: ret %bitcast_pointer_as1 %2 +; CHECK-NEXT: %1 = insertvalue %bitcast_pointer_as1 undef, <4 x i32> %load, 0 +; CHECK-NEXT: ret %bitcast_pointer_as1 %1 ; CHECK-LABEL: define void @bitcast_pointer_as1(<3 x i32> addrspace(1)* %0) #1 { ; CHECK-NEXT: %2 = call %bitcast_pointer_as1 @bitcast_pointer_as1.body(<3 x i32> addrspace(1)* undef) diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll --- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll @@ -814,16 +814,16 @@ ; CHECK-SAME: (void ()** [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FUNC:%.*]] = load i32 ()*, i32 ()** undef, align 8 ; CHECK-NEXT: [[CAST:%.*]] = bitcast void ()** [[OUT]] to i32 ()** -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 ()* [[FUNC]] to void ()* -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, void ()* [[TMP1]], 0 -; CHECK-NEXT: ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] undef, i32 ()* [[FUNC]], 0 +; CHECK-NEXT: ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type ; CHECK-SAME: (void ()** [[TMP0:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(void ()** undef) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0 -; CHECK-NEXT: store void ()* [[TMP3]], void ()** [[TMP0]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast void ()** [[TMP0]] to i32 ()** +; CHECK-NEXT: store i32 ()* [[TMP3]], i32 ()** [[TMP4]], align 8 ; CHECK-NEXT: ret void ; ; @@ -925,16 +925,16 @@ ; CHECK-SAME: (<3 x i32>* [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 ; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <3 x i32>* [[OUT]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <3 x i32> [[TMP1]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] undef, <4 x i32> [[LOAD]], 0 +; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32 ; CHECK-SAME: (<3 x i32>* [[TMP0:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(<3 x i32>* undef) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0 -; CHECK-NEXT: store <3 x i32> [[TMP3]], <3 x i32>* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <3 x i32>* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16 ; CHECK-NEXT: ret void ; ; @@ -942,17 +942,16 @@ ; CHECK-SAME: (<3 x float>* [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, <4 x i32> addrspace(1)* undef, align 16 ; CHECK-NEXT: [[BITCAST:%.*]] = bitcast <3 x float>* [[OUT]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[LOAD]], <4 x i32> poison, <3 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float> -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] undef, <4 x i32> [[LOAD]], 0 +; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32 ; CHECK-SAME: (<3 x float>* [[TMP0:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(<3 x float>* undef) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0 -; CHECK-NEXT: store <3 x float> [[TMP3]], <3 x float>* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <3 x float>* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 16 ; CHECK-NEXT: ret void ; ; @@ -960,32 +959,50 @@ ; CHECK-SAME: (float* [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 ; CHECK-NEXT: [[BITCAST:%.*]] = bitcast float* [[OUT]] to i32* -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[LOAD]] to float -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, float [[TMP1]], 0 -; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F32]] [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] undef, i32 [[LOAD]], 0 +; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32 ; CHECK-SAME: (float* [[TMP0:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(float* undef) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0 -; CHECK-NEXT: store float [[TMP3]], float* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP0]] to i32* +; CHECK-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16 +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body ; CHECK-SAME: (half* [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4 ; CHECK-NEXT: [[BITCAST:%.*]] = bitcast half* [[OUT]] to i32* -; CHECK-NEXT: store i32 [[LOAD]], i32* [[BITCAST]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] undef, i32 [[LOAD]], 0 +; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F16]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16 +; CHECK-SAME: (half* [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(half* undef) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast half* [[TMP0]] to i32* +; CHECK-NEXT: store i32 [[TMP3]], i32* [[TMP4]], align 4 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body ; CHECK-SAME: (i32* [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile half, half addrspace(1)* undef, align 2 ; CHECK-NEXT: [[BITCAST:%.*]] = bitcast i32* [[OUT]] to half* -; CHECK-NEXT: store half [[LOAD]], half* [[BITCAST]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] undef, half [[LOAD]], 0 +; CHECK-NEXT: ret [[BITCAST_POINTER_F16_I32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32 +; CHECK-SAME: (i32* [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(i32* undef) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to half* +; CHECK-NEXT: store half [[TMP3]], half* [[TMP4]], align 2 ; CHECK-NEXT: ret void ; ; @@ -993,17 +1010,16 @@ ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP2]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(%struct.v3f32* undef, <3 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; @@ -1011,52 +1027,48 @@ ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[EXTRACTVEC]], <4 x i32> poison, <3 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float> -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] undef, <4 x i32> [[EXTRACTVEC]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(%struct.v3f32* undef, <3 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(%struct.v4f32* undef, <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VALUE]], <4 x i32> poison, <3 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> [[TMP1]] to <3 x float> -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP2]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, [[STRUCT_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] undef, <4 x i32> [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32 ; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(%struct.v3f32* undef, <4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_V3F32:%.*]] [[TMP4]], %struct.v3f32* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; @@ -1064,62 +1076,97 @@ ; CHECK-SAME: (%struct.v4f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v4f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_V4F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, [[STRUCT_V4F32]] [[TMP1]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32 ; CHECK-SAME: (%struct.v4f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(%struct.v4f32* undef, <3 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_V4F32:%.*]] [[TMP4]], %struct.v4f32* [[TMP0]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v4f32* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body ; CHECK-SAME: (%struct.v3f32* [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32* [[OUT]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[VALUE]], <2 x float>* [[CAST]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] undef, <2 x float> [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32 +; CHECK-SAME: (%struct.v3f32* [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(%struct.v3f32* undef, <2 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32* [[TMP0]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], <4 x float>* [[CAST]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32 +; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(%struct.v3f32.f32* undef, <3 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body ; CHECK-SAME: (%struct.v3f32.f32* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.v3f32.f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[VALUE]], <4 x float>* [[CAST]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32 +; CHECK-SAME: (%struct.v3f32.f32* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(%struct.v3f32.f32* undef, <4 x float> [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.v3f32.f32* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body ; CHECK-SAME: (%struct.i128* [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast %struct.i128* [[OUT]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[VALUE]] to i128 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_I128:%.*]] undef, i128 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, [[STRUCT_I128]] [[TMP2]], 0 -; CHECK-NEXT: ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] undef, <4 x float> [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32 ; CHECK-SAME: (%struct.i128* [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(%struct.i128* undef, <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store [[STRUCT_I128:%.*]] [[TMP4]], %struct.i128* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.i128* [[TMP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body ; CHECK-SAME: ([4 x i32]* [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast [4 x i32]* [[OUT]] to [4 x float]* -; CHECK-NEXT: store [4 x float] [[VALUE]], [4 x float]* [[CAST]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] undef, [4 x float] [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32 +; CHECK-SAME: ([4 x i32]* [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body([4 x i32]* undef, [4 x float] [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [4 x i32]* [[TMP0]] to [4 x float]* +; CHECK-NEXT: store [4 x float] [[TMP4]], [4 x float]* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; ; @@ -1130,30 +1177,36 @@ ; CHECK: ret0: ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[CAST0:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>* -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[EXTRACTVEC]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_V3F32:%.*]] undef, <3 x float> [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[TMP1]], 0 -; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP2]] +; CHECK-NEXT: [[TMP0:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] undef, <4 x float> [[EXTRACTVEC]], 0 +; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP0]] ; CHECK: ret1: ; CHECK-NEXT: [[CAST1:%.*]] = bitcast %struct.v3f32* [[OUT]] to <4 x float>* ; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, <4 x float> addrspace(1)* undef, align 16 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[LOAD]], <4 x float> poison, <3 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_V3F32]] undef, <3 x float> [[TMP3]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, [[STRUCT_V3F32]] [[TMP4]], 0 -; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP5]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] undef, <4 x float> [[LOAD]], 0 +; CHECK-NEXT: ret [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32 ; CHECK-SAME: (i1 [[TMP0:%.*]], %struct.v3f32* [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], %struct.v3f32* undef, <3 x float> [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0 -; CHECK-NEXT: store [[STRUCT_V3F32:%.*]] [[TMP5]], %struct.v3f32* [[TMP1]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.v3f32* [[TMP1]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 16 ; CHECK-NEXT: ret void ; ; -; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32 +; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body ; CHECK-SAME: (<3 x float>* [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[CAST:%.*]] = bitcast <3 x float>* [[OUT]] to %struct.v3f32* -; CHECK-NEXT: store [[STRUCT_V3F32]] [[VALUE]], %struct.v3f32* [[CAST]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] undef, [[STRUCT_V3F32]] [[VALUE]], 0 +; CHECK-NEXT: ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]] +; +; +; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32 +; CHECK-SAME: (<3 x float>* [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(<3 x float>* undef, [[STRUCT_V3F32]] [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <3 x float>* [[TMP0]] to %struct.v3f32* +; CHECK-NEXT: store [[STRUCT_V3F32]] [[TMP4]], %struct.v3f32* [[TMP5]], align 16 ; CHECK-NEXT: ret void ;