Diff 464707

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Show First 20 Lines • Show All 302 Lines • ▼ Show 20 Lines	case 64: {
// We accept bit-widths >= 64bits and elements {8,16,32,64} bits.		// We accept bit-widths >= 64bits and elements {8,16,32,64} bits.
unsigned VectorBits = NumElements.getFixedValue() * ElementBits;		unsigned VectorBits = NumElements.getFixedValue() * ElementBits;
return VectorBits >= 64;		return VectorBits >= 64;
}		}
}		}
return false;		return false;
}		}

bool isLegalNTStore(Type *DataType, Align Alignment) {		bool isLegalNTStoreLoad(Type *DataType, Align Alignment) {
// NOTE: The logic below is mostly geared towards LV, which calls it with		// NOTE: The logic below is mostly geared towards LV, which calls it with
		fhahnUnsubmitted Not Done Reply Inline Actions This should also be moved to `isLegalNTStoreLoad` fhahn: This should also be moved to `isLegalNTStoreLoad`
		zjaffalAuthorUnsubmitted Done Reply Inline Actions This line only or the whole comment block ? zjaffal: This line only or the whole comment block ?
// vectors with 2 elements. We might want to improve that, if other		// vectors with 2 elements. We might want to improve that, if other
// users show up.		// users show up.
// Nontemporal vector stores can be directly lowered to STNP, if the vector		// Nontemporal vector loads/stores can be directly lowered to LDNP/STNP, if
// can be halved so that each half fits into a register. That's the case if		// the vector can be halved so that each half fits into a register. That's
// the element type fits into a register and the number of elements is a		// the case if the element type fits into a register and the number of
// power of 2 > 1.		// elements is a power of 2 > 1.
if (auto *DataTypeVTy = dyn_cast<VectorType>(DataType)) {		if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) {
unsigned NumElements =		unsigned NumElements = DataTypeTy->getNumElements();
cast<FixedVectorType>(DataTypeVTy)->getNumElements();		unsigned EltSize = DataTypeTy->getElementType()->getScalarSizeInBits();
unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits();
return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&		return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
EltSize <= 128 && isPowerOf2_64(EltSize);		EltSize <= 128 && isPowerOf2_64(EltSize);
}		}
		fhahnUnsubmitted Not Done Reply Inline Actions This could also be folded into `isLegalNTStoreLoad`, same for `return BaseT::isLegalNTStore(DataType, Alignment);`? fhahn: This could also be folded into `isLegalNTStoreLoad`, same for ` return BaseT::isLegalNTStore…
return BaseT::isLegalNTStore(DataType, Alignment);		return BaseT::isLegalNTStore(DataType, Alignment);
}		}

		bool isLegalNTStore(Type *DataType, Align Alignment) {
		return isLegalNTStoreLoad(DataType, Alignment);
		dmgreenUnsubmitted Done Reply Inline Actions This can be `dyn_cast<FixedVectorType>`, which avoids the need for the cast<..> below. The same thing can be done above in isLegalNTStore too. dmgreen: This can be `dyn_cast<FixedVectorType>`, which avoids the need for the cast<..> below. The same…
		}

		fhahnUnsubmitted Not Done Reply Inline Actions Is there much difference between the load and store version? Could they just share the same code? I think this would also require support for generating `LDNP` for types smaller than 256 bits. @zjaffal is currently looking into this. fhahn: Is there much difference between the load and store version? Could they just share the same…
		bool isLegalNTLoad(Type *DataType, Align Alignment) {
		// Only supports little-endian targets.
		if (ST->isLittleEndian())
		return isLegalNTStoreLoad(DataType, Alignment);
		return BaseT::isLegalNTLoad(DataType, Alignment);
		}

bool enableOrderedReductions() const { return true; }		bool enableOrderedReductions() const { return true; }

InstructionCost getInterleavedMemoryOpCost(		InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,		unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,		Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);		bool UseMaskForCond = false, bool UseMaskForGaps = false);

bool		bool
▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll

Show First 20 Lines • Show All 252 Lines • ▼ Show 20 Lines	for.body: ; preds = %entry, %for.body
br i1 %cmp, label %for.body, label %for.cond.cleanup		br i1 %cmp, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret void		ret void
}		}

define i4 @test_i4_load(i4* %ddst) {		define i4 @test_i4_load(i4* %ddst) {
; CHECK-LABEL: define i4 @test_i4_load		; CHECK-LABEL: define i4 @test_i4_load
; CHECK-LABEL: vector.body:		; CHECK-NOT: vector.body:
; CHECK: [[LOAD:%.]] = load i4, i4 {{.*}}, align 1, !nontemporal !0
; CHECk: ret i4 %{{.*}}		; CHECk: ret i4 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i4 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i4 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i4, i4* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds i4, i4* %ddst, i64 %indvars.iv
%l = load i4, i4* %arrayidx, align 1, !nontemporal !8		%l = load i4, i4* %arrayidx, align 1, !nontemporal !8
%add = add i4 %l, %acc.08		%add = add i4 %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret i4 %add		ret i4 %add
}		}

define i8 @test_load_i8(i8* %ddst) {		define i8 @test_load_i8(i8* %ddst) {
; CHECK-LABEL: @test_load_i8(		; CHECK-LABEL: @test_load_i8(
; CHECK-NOT: vector.body:		; CHECK: vector.body:
		; CHECK: load <4 x i8>, <4 x i8>* {{.*}}, align 1, !nontemporal !0
		dmgreenUnsubmitted Not Done Reply Inline Actions Is it worth adding a quick check line for the code that is produced? `; CHECK: = load <16 x i8>, <16 x i8>* {{.*}}, align 1, !nontemporal !0` so that the new load is tested. dmgreen: Is it worth adding a quick check line for the code that is produced? `; CHECK: = load <16 x i8>…
; CHECk: ret i8 %{{.*}}		; CHECk: ret i8 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i8 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i8 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i8, i8* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds i8, i8* %ddst, i64 %indvars.iv
%l = load i8, i8* %arrayidx, align 1, !nontemporal !8		%l = load i8, i8* %arrayidx, align 1, !nontemporal !8
%add = add i8 %l, %acc.08		%add = add i8 %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret i8 %add		ret i8 %add
}		}

define half @test_half_load(half* %ddst) {		define half @test_half_load(half* %ddst) {
; CHECK-LABEL: @test_half_load		; CHECK-LABEL: @test_half_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x half>, <4 x half>* {{.*}}, align 2, !nontemporal !0
; CHECk: ret half %{{.*}}		; CHECk: ret half %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi half [ 0.0, %entry ], [ %add, %for.body ]		%acc.08 = phi half [ 0.0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds half, half* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds half, half* %ddst, i64 %indvars.iv
%l = load half, half* %arrayidx, align 2, !nontemporal !8		%l = load half, half* %arrayidx, align 2, !nontemporal !8
%add = fadd half %l, %acc.08		%add = fadd half %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret half %add		ret half %add
}		}

define i16 @test_i16_load(i16* %ddst) {		define i16 @test_i16_load(i16* %ddst) {
; CHECK-LABEL: @test_i16_load		; CHECK-LABEL: @test_i16_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x i16>, <4 x i16>* {{.*}}, align 2, !nontemporal !0
; CHECk: ret i16 %{{.*}}		; CHECk: ret i16 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i16 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i16 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i16, i16* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds i16, i16* %ddst, i64 %indvars.iv
%l = load i16, i16* %arrayidx, align 2, !nontemporal !8		%l = load i16, i16* %arrayidx, align 2, !nontemporal !8
%add = add i16 %l, %acc.08		%add = add i16 %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret i16 %add		ret i16 %add
}		}

define i32 @test_i32_load(i32* %ddst) {		define i32 @test_i32_load(i32* %ddst) {
; CHECK-LABEL: @test_i32_load		; CHECK-LABEL: @test_i32_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x i32>, <4 x i32>* {{.*}}, align 4, !nontemporal !0
; CHECk: ret i32 %{{.*}}		; CHECk: ret i32 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i32 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i32 [ 0, %entry ], [ %add, %for.body ]
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	for.body: ; preds = %entry, %for.body
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret i40 %add		ret i40 %add
}		}

define i64 @test_i64_load(i64* %ddst) {		define i64 @test_i64_load(i64* %ddst) {
; CHECK-LABEL: @test_i64_load		; CHECK-LABEL: @test_i64_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x i64>, <4 x i64>* {{.*}}, align 4, !nontemporal !0
; CHECk: ret i64 %{{.*}}		; CHECk: ret i64 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i64 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i64 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds i64, i64* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds i64, i64* %ddst, i64 %indvars.iv
%l = load i64, i64* %arrayidx, align 4, !nontemporal !8		%l = load i64, i64* %arrayidx, align 4, !nontemporal !8
%add = add i64 %l, %acc.08		%add = add i64 %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret i64 %add		ret i64 %add
}		}

define double @test_double_load(double* %ddst) {		define double @test_double_load(double* %ddst) {
; CHECK-LABEL: @test_double_load		; CHECK-LABEL: @test_double_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x double>, <4 x double>* {{.*}}, align 4, !nontemporal !0
; CHECk: ret double %{{.*}}		; CHECk: ret double %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi double [ 0.0, %entry ], [ %add, %for.body ]		%acc.08 = phi double [ 0.0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds double, double* %ddst, i64 %indvars.iv		%arrayidx = getelementptr inbounds double, double* %ddst, i64 %indvars.iv
%l = load double, double* %arrayidx, align 4, !nontemporal !8		%l = load double, double* %arrayidx, align 4, !nontemporal !8
%add = fadd double %l, %acc.08		%add = fadd double %l, %acc.08
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1		%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, 4092		%exitcond.not = icmp eq i64 %indvars.iv.next, 4092
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body		br i1 %exitcond.not, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body		for.cond.cleanup: ; preds = %for.body
ret double %add		ret double %add
}		}

define i128 @test_i128_load(i128* %ddst) {		define i128 @test_i128_load(i128* %ddst) {
; CHECK-LABEL: @test_i128_load		; CHECK-LABEL: @test_i128_load
; CHECK-NOT: vector.body:		; CHECK-LABEL: vector.body:
		; CHECK: load <4 x i128>, <4 x i128>* {{.*}}, align 4, !nontemporal !0
; CHECk: ret i128 %{{.*}}		; CHECk: ret i128 %{{.*}}
;		;
entry:		entry:
br label %for.body		br label %for.body

for.body: ; preds = %entry, %for.body		for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]		%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%acc.08 = phi i128 [ 0, %entry ], [ %add, %for.body ]		%acc.08 = phi i128 [ 0, %entry ], [ %add, %for.body ]
Show All 34 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Add support to loop vectorization for non temporal loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 464707

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Add support to loop vectorization for non temporal loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 464707

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

llvm/test/Transforms/LoopVectorize/AArch64/nontemporal-load-store.ll

[AArch64] Add support to loop vectorization for non temporal loads
ClosedPublic