This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Vectorize/
-
Transforms/
-
Vectorize/
-
VectorCombine.cpp
-
test/Transforms/VectorCombine/X86/
-
Transforms/
-
VectorCombine/
-
X86/
-
load.ll

Differential D93238

[VectorCombine] make load transform poison-safe
ClosedPublic

Authored by spatel on Dec 14 2020, 12:21 PM.

Download Raw Diff

Details

Reviewers

lebedev.ri
nlopes
aqjune
RKSimon

Commits

rGd399f870b5a9: [VectorCombine] make load transform poison-safe

Summary

As noted in D93229, the transform from scalar load to vector load potentially leaks poison from the extra vector elements that are being loaded.

We could use freeze here (and x86 codegen at least appears to be the same either way), but we already have a shuffle in this logic to optionally change the vector size, so let's allow that instruction to serve both purposes.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

spatel created this revision. · Dec 14 2020, 12:21 PM

Herald added subscribers: pengfei, hiraditya, mcrosier. · View Herald Transcript · Dec 14 2020, 12:21 PM

spatel requested review of this revision. · Dec 14 2020, 12:21 PM

Herald added a project: Restricted Project. · View Herald Transcript · Dec 14 2020, 12:21 PM

LGTM, thanks.
(I haven't yet checked whether Alive2 can point out any other problems given the existing test coverage.)

lebedev.ri accepted this revision. · Dec 14 2020, 12:31 PM

This revision is now accepted and ready to land. · Dec 14 2020, 12:31 PM

Closed by commit rGd399f870b5a9: [VectorCombine] make load transform poison-safe (authored by spatel). · Explain Why · Dec 14 2020, 2:42 PM

This revision was automatically updated to reflect the committed changes.

spatel added a commit: rGd399f870b5a9: [VectorCombine] make load transform poison-safe.

spatel mentioned this in D93229: [VectorCombine] allow peeking through GEPs when creating a vector load. · Dec 15 2020, 4:59 AM

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Vectorize/

VectorCombine.cpp

20 lines

test/

Transforms/

VectorCombine/

X86/

load.ll

38 lines

Diff 311716

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Show First 20 Lines • Show All 155 Lines • ▼ Show 20 Lines	if (OldCost < NewCost)
return false;		return false;

// It is safe and potentially profitable to load a vector directly:		// It is safe and potentially profitable to load a vector directly:
// inselt undef, load Scalar, 0 --> load VecPtr		// inselt undef, load Scalar, 0 --> load VecPtr
IRBuilder<> Builder(Load);		IRBuilder<> Builder(Load);
Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));		Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);		Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);

// If the insert type does not match the target's minimum vector type,		// Set everything but element 0 to undef to prevent poison from propagating
// use an identity shuffle to shrink/grow the vector.		// from the extra loaded memory. This will also optionally shrink/grow the
if (Ty != MinVecTy) {		// vector from the loaded size to the output size.
		// We assume this operation has no cost in codegen.
		// Note that we could use freeze to avoid poison problems, but then we might
		// still need a shuffle to change the vector size.
unsigned OutputNumElts = Ty->getNumElements();		unsigned OutputNumElts = Ty->getNumElements();
SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);		SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)		Mask[0] = 0;
Mask[i] = i;
VecLd = Builder.CreateShuffleVector(VecLd, Mask);		VecLd = Builder.CreateShuffleVector(VecLd, Mask);
}
replaceValue(I, *VecLd);		replaceValue(I, *VecLd);
++NumVecLoad;		++NumVecLoad;
return true;		return true;
}		}

/// Determine which, if any, of the inputs should be replaced by a shuffle		/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.		/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(		ExtractElementInst *VectorCombine::getShuffleExtract(
▲ Show 20 Lines • Show All 612 Lines • Show Last 20 Lines

llvm/test/Transforms/VectorCombine/X86/load.ll

Show First 20 Lines • Show All 169 Lines • ▼ Show 20 Lines	;
%bc = bitcast <8 x float>* %p to double*		%bc = bitcast <8 x float>* %p to double*
%r = load double, double* %bc, align 32		%r = load double, double* %bc, align 32
ret double %r		ret double %r
}		}

define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {		define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v4f32(		; CHECK-LABEL: @load_f32_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>		; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%s = load float, float* %p, align 4		%s = load float, float* %p, align 4
%r = insertelement <4 x float> undef, float %s, i32 0		%r = insertelement <4 x float> undef, float %s, i32 0
ret <4 x float> %r		ret <4 x float> %r
}		}

define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {		define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(		; CHECK-LABEL: @casted_load_f32_insert_v4f32(
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> [[P:%.*]], align 4		; CHECK-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> [[P:%.*]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%b = bitcast <4 x float>* %p to float*		%b = bitcast <4 x float>* %p to float*
%s = load float, float* %b, align 4		%s = load float, float* %b, align 4
%r = insertelement <4 x float> undef, float %s, i32 0		%r = insertelement <4 x float> undef, float %s, i32 0
ret <4 x float> %r		ret <4 x float> %r
}		}

; Element type does not change cost.		; Element type does not change cost.

define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {		define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v4i32(		; CHECK-LABEL: @load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[P:%.]] to <4 x i32>		; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[P:%.]] to <4 x i32>
; CHECK-NEXT: [[R:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[R]]		; CHECK-NEXT: ret <4 x i32> [[R]]
;		;
%s = load i32, i32* %p, align 4		%s = load i32, i32* %p, align 4
%r = insertelement <4 x i32> undef, i32 %s, i32 0		%r = insertelement <4 x i32> undef, i32 %s, i32 0
ret <4 x i32> %r		ret <4 x i32> %r
}		}

; Pointer type does not change cost.		; Pointer type does not change cost.

define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {		define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(		; CHECK-LABEL: @casted_load_i32_insert_v4i32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast <16 x i8> [[P:%.]] to <4 x i32>		; CHECK-NEXT: [[TMP1:%.]] = bitcast <16 x i8> [[P:%.]] to <4 x i32>
; CHECK-NEXT: [[R:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x i32> [[R]]		; CHECK-NEXT: ret <4 x i32> [[R]]
;		;
%b = bitcast <16 x i8>* %p to i32*		%b = bitcast <16 x i8>* %p to i32*
%s = load i32, i32* %b, align 4		%s = load i32, i32* %b, align 4
%r = insertelement <4 x i32> undef, i32 %s, i32 0		%r = insertelement <4 x i32> undef, i32 %s, i32 0
ret <4 x i32> %r		ret <4 x i32> %r
}		}

; This is canonical form for vector element access.		; This is canonical form for vector element access.

define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {		define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(		; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> [[P:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> [[P:%.*]], align 16
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0		%gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
%s = load float, float* %gep, align 16		%s = load float, float* %gep, align 16
%r = insertelement <4 x float> undef, float %s, i64 0		%r = insertelement <4 x float> undef, float %s, i64 0
ret <4 x float> %r		ret <4 x float> %r
}		}

; Should work with addrspace as well.		; Should work with addrspace as well.

define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) {		define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(		; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> addrspace(44) [[P:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = load <4 x float>, <4 x float> addrspace(44) [[P:%.*]], align 16
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0		%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
%s = load float, float addrspace(44)* %gep, align 16		%s = load float, float addrspace(44)* %gep, align 16
%r = insertelement <4 x float> undef, float %s, i64 0		%r = insertelement <4 x float> undef, float %s, i64 0
ret <4 x float> %r		ret <4 x float> %r
}		}

; If there are enough dereferenceable bytes, we can offset the vector load.		; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {		define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(		; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.]] = getelementptr inbounds <8 x i16>, <8 x i16> [[P:%.*]], i64 0, i64 1		; CHECK-NEXT: [[GEP:%.]] = getelementptr inbounds <8 x i16>, <8 x i16> [[P:%.*]], i64 0, i64 1
; CHECK-NEXT: [[TMP1:%.]] = bitcast i16 [[GEP]] to <8 x i16>*		; CHECK-NEXT: [[TMP1:%.]] = bitcast i16 [[GEP]] to <8 x i16>*
; CHECK-NEXT: [[R:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 2		; CHECK-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 2
		; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i16> [[R]]		; CHECK-NEXT: ret <8 x i16> [[R]]
;		;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1		%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
%s = load i16, i16* %gep, align 2		%s = load i16, i16* %gep, align 2
%r = insertelement <8 x i16> undef, i16 %s, i64 0		%r = insertelement <8 x i16> undef, i16 %s, i64 0
ret <8 x i16> %r		ret <8 x i16> %r
}		}

Show All 13 Lines
}		}

; If there are enough dereferenceable bytes, we can offset the vector load.		; If there are enough dereferenceable bytes, we can offset the vector load.

define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {		define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(		; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.]] = getelementptr inbounds <8 x i16>, <8 x i16> [[P:%.*]], i64 1, i64 0		; CHECK-NEXT: [[GEP:%.]] = getelementptr inbounds <8 x i16>, <8 x i16> [[P:%.*]], i64 1, i64 0
; CHECK-NEXT: [[TMP1:%.]] = bitcast i16 [[GEP]] to <8 x i16>*		; CHECK-NEXT: [[TMP1:%.]] = bitcast i16 [[GEP]] to <8 x i16>*
; CHECK-NEXT: [[R:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 16		; CHECK-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 16
		; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i16> [[R]]		; CHECK-NEXT: ret <8 x i16> [[R]]
;		;
%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0		%gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
%s = load i16, i16* %gep, align 16		%s = load i16, i16* %gep, align 16
%r = insertelement <8 x i16> undef, i16 %s, i64 0		%r = insertelement <8 x i16> undef, i16 %s, i64 0
ret <8 x i16> %r		ret <8 x i16> %r
}		}

▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	;
%r = insertelement <4 x float> undef, float %s, i32 0		%r = insertelement <4 x float> undef, float %s, i32 0
ret <4 x float> %r		ret <4 x float> %r
}		}

define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {		define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_i32_insert_v8i32(		; CHECK-LABEL: @load_i32_insert_v8i32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[P:%.]] to <4 x i32>		; CHECK-NEXT: [[TMP1:%.]] = bitcast i32 [[P:%.]] to <4 x i32>
; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x i32>, <4 x i32> [[TMP1]], align 4
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i32> [[R]]		; CHECK-NEXT: ret <8 x i32> [[R]]
;		;
%s = load i32, i32* %p, align 4		%s = load i32, i32* %p, align 4
%r = insertelement <8 x i32> undef, i32 %s, i32 0		%r = insertelement <8 x i32> undef, i32 %s, i32 0
ret <8 x i32> %r		ret <8 x i32> %r
}		}

define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {		define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(		; CHECK-LABEL: @casted_load_i32_insert_v8i32(
; CHECK-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> [[P:%.*]], align 4		; CHECK-NEXT: [[TMP1:%.]] = load <4 x i32>, <4 x i32> [[P:%.*]], align 4
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <8 x i32> [[R]]		; CHECK-NEXT: ret <8 x i32> [[R]]
;		;
%b = bitcast <4 x i32>* %p to i32*		%b = bitcast <4 x i32>* %p to i32*
%s = load i32, i32* %b, align 4		%s = load i32, i32* %b, align 4
%r = insertelement <8 x i32> undef, i32 %s, i32 0		%r = insertelement <8 x i32> undef, i32 %s, i32 0
ret <8 x i32> %r		ret <8 x i32> %r
}		}

define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {		define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v16f32(		; CHECK-LABEL: @load_f32_insert_v16f32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>		; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>
; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <16 x float> [[R]]		; CHECK-NEXT: ret <16 x float> [[R]]
;		;
%s = load float, float* %p, align 4		%s = load float, float* %p, align 4
%r = insertelement <16 x float> undef, float %s, i32 0		%r = insertelement <16 x float> undef, float %s, i32 0
ret <16 x float> %r		ret <16 x float> %r
}		}

define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {		define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_f32_insert_v2f32(		; CHECK-LABEL: @load_f32_insert_v2f32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>		; CHECK-NEXT: [[TMP1:%.]] = bitcast float [[P:%.]] to <4 x float>
; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 undef>
; CHECK-NEXT: ret <2 x float> [[R]]		; CHECK-NEXT: ret <2 x float> [[R]]
;		;
%s = load float, float* %p, align 4		%s = load float, float* %p, align 4
%r = insertelement <2 x float> undef, float %s, i32 0		%r = insertelement <2 x float> undef, float %s, i32 0
ret <2 x float> %r		ret <2 x float> %r
}		}

; Negative test - suppress load widening for asan/hwasan/memtag/tsan.		; Negative test - suppress load widening for asan/hwasan/memtag/tsan.
Show All 37 Lines	;
%result1 = insertelement <2 x float> %result0, float %t5, i32 1		%result1 = insertelement <2 x float> %result0, float %t5, i32 1
store <2 x float> %result1, <2 x float>* %resultptr, align 8		store <2 x float> %result1, <2 x float>* %resultptr, align 8
ret void		ret void
}		}

define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {		define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(		; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast <2 x float> [[P:%.]] to <4 x float>		; CHECK-NEXT: [[TMP1:%.]] = bitcast <2 x float> [[P:%.]] to <4 x float>
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%l = load <2 x float>, <2 x float>* %p, align 4		%l = load <2 x float>, <2 x float>* %p, align 4
%s = extractelement <2 x float> %l, i32 0		%s = extractelement <2 x float> %l, i32 0
%r = insertelement <4 x float> undef, float %s, i32 0		%r = insertelement <4 x float> undef, float %s, i32 0
ret <4 x float> %r		ret <4 x float> %r
}		}

define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {		define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {
; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(		; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
; CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x float> [[P:%.]] to <4 x float>		; CHECK-NEXT: [[TMP1:%.]] = bitcast <8 x float> [[P:%.]] to <4 x float>
; CHECK-NEXT: [[R:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4		; CHECK-NEXT: [[TMP2:%.]] = load <4 x float>, <4 x float> [[TMP1]], align 4
		; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: ret <4 x float> [[R]]		; CHECK-NEXT: ret <4 x float> [[R]]
;		;
%l = load <8 x float>, <8 x float>* %p, align 4		%l = load <8 x float>, <8 x float>* %p, align 4
%s = extractelement <8 x float> %l, i32 0		%s = extractelement <8 x float> %l, i32 0
%r = insertelement <4 x float> undef, float %s, i32 0		%r = insertelement <4 x float> undef, float %s, i32 0
ret <4 x float> %r		ret <4 x float> %r
}		}

Show All 31 Lines