This is an archive of the discontinued LLVM Phabricator instance.

[x86, AVX] replace masked load with full vector load when possible
ClosedPublic

Authored by spatel on Mar 11 2016, 12:00 PM.

Download Raw Diff

Details

Reviewers

RKSimon
delena
ashutosh.nema

Commits

rG62d707c8d91a: [x86, AVX] replace masked load with full vector load when possible
rL263446: [x86, AVX] replace masked load with full vector load when possible

Summary

This patch shows the benefit of converting masked vector loads to regular vector loads for x86 AVX.
I've raised the legality issue of reading the extra memory bytes on llvm-dev.

x86 already does this kind of optimization for multiple scalar loads -> vector load.
If other targets have the same flexibility, we could move this transform up to CGP or DAGCombiner.

Diff Detail

Event Timeline

spatel updated this revision to Diff 50458. · Mar 11 2016, 12:00 PM

spatel retitled this revision to [x86, AVX] replace masked load with full vector load when possible.

spatel updated this object.

spatel added reviewers: ashutosh.nema, RKSimon, delena.

spatel added a subscriber: llvm-commits.

Herald added a subscriber: mcrosier. · View Herald Transcript · Mar 11 2016, 12:00 PM

LGTM with one minor comment

lib/Target/X86/X86ISelLowering.cpp
27371	if (ML->getSrc0().isUndef())

This revision is now accepted and ready to land. · Mar 13 2016, 1:57 PM

Thanks Sanjay for working on this.

Changes look OK to me, just a minor comment.

lib/Target/X86/X86ISelLowering.cpp
27342	Please retain and add comments here.

spatel added inline comments. · Mar 14 2016, 9:22 AM

lib/Target/X86/X86ISelLowering.cpp
27342	I moved the comment down to line 27365 because it wasn't function-level anymore. Please let me know if you have something else in mind. Thanks!

spatel marked an inline comment as done. · Mar 14 2016, 9:25 AM

spatel added inline comments.

lib/Target/X86/X86ISelLowering.cpp
27371	Good catch. I cut and pasted that. Looks like a mass edit is needed - a grep shows 194 of these scattered around the code base.

Closed by commit rL263446: [x86, AVX] replace masked load with full vector load when possible (authored by spatel). · Explain Why · Mar 14 2016, 9:59 AM

This revision was automatically updated to reflect the committed changes.

spatel marked an inline comment as done.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

32 lines

test/

CodeGen/

X86/

masked_memop.ll

31 lines

Diff 50458

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 27,333 Lines • ▼ Show 20 Lines	SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
ML->isNonTemporal(), ML->isInvariant(), Alignment);		ML->isNonTemporal(), ML->isInvariant(), Alignment);

// Insert the loaded element into the appropriate place in the vector.		// Insert the loaded element into the appropriate place in the vector.
SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),		SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
Load, VecIndex);		Load, VecIndex);
return DCI.CombineTo(ML, Insert, Load.getValue(1), true);		return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}		}

/// Convert a masked load with a constant mask into a masked load and a select.
/// This allows the select operation to use a faster kind of shuffle instruction
/// (for example, vblendvps -> vblendps).
static SDValue		static SDValue
		ashutosh.nemaUnsubmitted Not Done Reply Inline Actions Please retain and add comments here. ashutosh.nema: Please retain and add comments here.
		spatelAuthorUnsubmitted Not Done Reply Inline Actions I moved the comment down to line 27365 because it wasn't function-level anymore. Please let me know if you have something else in mind. Thanks! spatel: I moved the comment down to line 27365 because it wasn't function-level anymore. Please let me…
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,		combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {		TargetLowering::DAGCombinerInfo &DCI) {
		if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
		return SDValue();

		SDLoc DL(ML);
		EVT VT = ML->getValueType(0);

		// If we are loading the first and last elements of a vector, it is safe and
		// always faster to load the whole vector. Replace the masked load with a
		// vector load and select.
		unsigned NumElts = VT.getVectorNumElements();
		BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
		bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
		bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
		if (LoadFirstElt && LoadLastElt) {
		SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
		ML->getMemOperand());
		SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
		return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
		}

		// Convert a masked load with a constant mask into a masked load and a select.
		// This allows the select operation to use a faster kind of select instruction
		// (for example, vblendvps -> vblendps).

// Don't try this if the pass-through operand is already undefined. That would		// Don't try this if the pass-through operand is already undefined. That would
// cause an infinite loop because that's what we're about to create.		// cause an infinite loop because that's what we're about to create.
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()) ||		if (ML->getSrc0().getOpcode() == ISD::UNDEF)
		RKSimonUnsubmitted Done Reply Inline Actions if (ML->getSrc0().isUndef()) RKSimon: if (ML->getSrc0().isUndef())
		spatelAuthorUnsubmitted Not Done Reply Inline Actions Good catch. I cut and pasted that. Looks like a mass edit is needed - a grep shows 194 of these scattered around the code base. spatel: Good catch. I cut and pasted that. Looks like a mass edit is needed - a grep shows 194 of these…
ML->getSrc0().getOpcode() == ISD::UNDEF)
return SDValue();		return SDValue();

// The new masked load has an undef pass-through operand. The select uses the		// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.		// original pass-through operand.
SDLoc DL(ML);
EVT VT = ML->getValueType(0);
SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),		SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
ML->getMask(), DAG.getUNDEF(VT),		ML->getMask(), DAG.getUNDEF(VT),
ML->getMemoryVT(), ML->getMemOperand(),		ML->getMemoryVT(), ML->getMemOperand(),
ML->getExtensionType());		ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());		SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);		return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}		}
▲ Show 20 Lines • Show All 2,741 Lines • Show Last 20 Lines

test/CodeGen/X86/masked_memop.ll

Show First 20 Lines • Show All 909 Lines • ▼ Show 20 Lines	; SKX-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer		%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)		%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef)
ret <2 x float> %res		ret <2 x float> %res
}		}

define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {		define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
; AVX-LABEL: load_all:		; AVX-LABEL: load_all:
; AVX: ## BB#0:		; AVX: ## BB#0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0		; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; AVX512F-LABEL: load_all:		; AVX512F-LABEL: load_all:
; AVX512F: ## BB#0:		; AVX512F: ## BB#0:
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0		; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0		; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
Show All 9 Lines

;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.		;;; Loads with Constant Masks - these should be optimized to use something other than a variable blend.

; 128-bit FP vectors are supported with AVX.		; 128-bit FP vectors are supported with AVX.

define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {		define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: mload_constmask_v4f32:		; AVX-LABEL: mload_constmask_v4f32:
; AVX: ## BB#0:		; AVX: ## BB#0:
; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]		; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; AVX512F-LABEL: mload_constmask_v4f32:		; AVX512F-LABEL: mload_constmask_v4f32:
; AVX512F: ## BB#0:		; AVX512F: ## BB#0:
; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]		; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
; AVX512F-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2		; AVX512F-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0		; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	; SKX-NEXT: retq
ret <4 x double> %res		ret <4 x double> %res
}		}

; 256-bit integer vectors are supported with AVX2.		; 256-bit integer vectors are supported with AVX2.

define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {		define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-LABEL: mload_constmask_v8i32:		; AVX1-LABEL: mload_constmask_v8i32:
; AVX1: ## BB#0:		; AVX1: ## BB#0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]		; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX1-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: mload_constmask_v8i32:		; AVX2-LABEL: mload_constmask_v8i32:
; AVX2: ## BB#0:		; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,4294967295]		; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
; AVX2-NEXT: vpmaskmovd (%rdi), %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512F-LABEL: mload_constmask_v8i32:		; AVX512F-LABEL: mload_constmask_v8i32:
; AVX512F: ## BB#0:		; AVX512F: ## BB#0:
; AVX512F-NEXT: movw $135, %ax		; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1		; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}		; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; SKX-LABEL: mload_constmask_v8i32:		; SKX-LABEL: mload_constmask_v8i32:
; SKX: ## BB#0:		; SKX: ## BB#0:
; SKX-NEXT: movb $-121, %al		; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovw %eax, %k1		; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}		; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq		; SKX-NEXT: retq
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)		%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
ret <8 x i32> %res		ret <8 x i32> %res
}		}

define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {		define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX1-LABEL: mload_constmask_v4i64:		; AVX1-LABEL: mload_constmask_v4i64:
; AVX1: ## BB#0:		; AVX1: ## BB#0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]		; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3]
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: mload_constmask_v4i64:		; AVX2-LABEL: mload_constmask_v4i64:
; AVX2: ## BB#0:		; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]		; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512F-LABEL: mload_constmask_v4i64:		; AVX512F-LABEL: mload_constmask_v4i64:
; AVX512F: ## BB#0:		; AVX512F: ## BB#0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]		; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2		; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2
; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0		; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq		; AVX512F-NEXT: retq
;		;
; SKX-LABEL: mload_constmask_v4i64:		; SKX-LABEL: mload_constmask_v4i64:
; SKX: ## BB#0:		; SKX: ## BB#0:
; SKX-NEXT: movb $9, %al		; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovw %eax, %k1		; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}		; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; SKX-NEXT: retq		; SKX-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)		%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
ret <4 x i64> %res		ret <4 x i64> %res
}		}

; 512-bit FP vectors are supported with AVX512.		; 512-bit FP vectors are supported with AVX512.

define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {		define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; AVX-LABEL: mload_constmask_v8f64:		; AVX-LABEL: mload_constmask_v8f64:
; AVX: ## BB#0:		; AVX: ## BB#0:
; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,0]		; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm2, %ymm2		; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [0,0,0,18446744073709551615]
; AVX-NEXT: vmaskmovpd 32(%rdi), %ymm2, %ymm2
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3]
; AVX-NEXT: retq		; AVX-NEXT: retq
;		;
; AVX512-LABEL: mload_constmask_v8f64:		; AVX512-LABEL: mload_constmask_v8f64:
; AVX512: ## BB#0:		; AVX512: ## BB#0:
; AVX512-NEXT: movb $-121, %al		; AVX512-NEXT: movb $-121, %al
; AVX512-NEXT: kmovw %eax, %k1		; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1}		; AVX512-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512-NEXT: retq		; AVX512-NEXT: retq
▲ Show 20 Lines • Show All 1,234 Lines • Show Last 20 Lines