This is an archive of the discontinued LLVM Phabricator instance.

[x86] narrow 256-bit horizontal ops via demanded elements
ClosedPublic

Authored by spatel on Feb 6 2019, 12:58 PM.

Download Raw Diff

Details

Reviewers

RKSimon
craig.topper
andreadb

Commits

rG833550fc74b9: [x86] narrow 256-bit horizontal ops via demanded elements
rL353641: [x86] narrow 256-bit horizontal ops via demanded elements

Summary

256-bit horizontal math ops are an x86 monstrosity (and thankfully have not been extended to 512-bit AFAIK).

The two 128-bit halves operate on separate halves of the inputs. So if we don't demand anything in the upper half of the result, we can extract the low halves of the inputs, do the math, and then insert that result into a 256-bit output.

All of the extract/insert is free (ymm<-->xmm), so we're left with a narrower (cheaper) version of the original op.

In the affected tests based on:
https://bugs.llvm.org/show_bug.cgi?id=33758
https://bugs.llvm.org/show_bug.cgi?id=38971
...we see that the h-op narrowing can result in further narrowing of other math via existing generic transforms.

I originally drafted this patch as an exact pattern match starting from extract_vector_elt, but I thought we might see diffs starting from extract_subvector too, so I changed it to a more general demanded-elements solution. That switch produced no additional improvements in the existing regression tests, though, so we could go back. The patch is slightly less code this way, assuming I didn't miss any constraints.

Diff Detail

Repository: rL LLVM

Event Timeline

spatel created this revision. · Feb 6 2019, 12:58 PM

Herald added a subscriber: mcrosier. · View Herald Transcript · Feb 6 2019, 12:58 PM

RKSimon added inline comments. · Feb 7 2019, 2:29 AM

lib/Target/X86/X86ISelLowering.cpp
32980 ↗	(On Diff #185618)	Can you use the extract128BitVector helper to do this?

Patch updated:
Use {insert/extract}128BitVector to reduce code.

LGTM with one (optional) minor comment.

lib/Target/X86/X86ISelLowering.cpp
32981 ↗	(On Diff #185803)	You should be able to just use Ext0.getValueType()?

This revision is now accepted and ready to land. · Feb 7 2019, 10:40 AM

Closed by commit rL353641: [x86] narrow 256-bit horizontal ops via demanded elements (authored by spatel). · Explain Why · Feb 10 2019, 7:22 AM

This revision was automatically updated to reflect the committed changes.

Herald added a project: Restricted Project. · View Herald Transcript · Feb 10 2019, 7:22 AM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86ISelLowering.cpp

18 lines

test/

CodeGen/

X86/

haddsub.ll

12 lines

phaddsub-extract.ll

68 lines

Diff 186149

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 32,945 Lines • ▼ Show 20 Lines	case X86ISD::PSHUFB: {
// TODO - simplify other variable shuffle masks.		// TODO - simplify other variable shuffle masks.
SDValue Mask = Op.getOperand(1);		SDValue Mask = Op.getOperand(1);
APInt MaskUndef, MaskZero;		APInt MaskUndef, MaskZero;
if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,		if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
Depth + 1))		Depth + 1))
return true;		return true;
break;		break;
}		}
		case X86ISD::HADD:
		case X86ISD::HSUB:
		case X86ISD::FHADD:
		case X86ISD::FHSUB: {
		// 256-bit horizontal ops are two 128-bit ops glued together. If we do not
		// demand any of the high elements, then narrow the h-op to 128-bits:
		// (hop ymm0, ymm1) --> insert undef, (hop xmm0, xmm1), 0
		if (VT.is256BitVector() && DemandedElts.lshr(NumElts / 2) == 0) {
		SDLoc DL(Op);
		SDValue Ext0 = extract128BitVector(Op.getOperand(0), 0, TLO.DAG, DL);
		SDValue Ext1 = extract128BitVector(Op.getOperand(1), 0, TLO.DAG, DL);
		SDValue Hop = TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Ext1);
		SDValue UndefVec = TLO.DAG.getUNDEF(VT);
		SDValue Insert = insert128BitVector(UndefVec, Hop, 0, TLO.DAG, DL);
		return TLO.CombineTo(Op, Insert);
		}
		break;
		}
}		}

// Simplify target shuffles.		// Simplify target shuffles.
if (!isTargetShuffle(Opc) \|\| !VT.isSimple())		if (!isTargetShuffle(Opc) \|\| !VT.isSimple())
return false;		return false;

// Get target shuffle mask.		// Get target shuffle mask.
bool IsUnary;		bool IsUnary;
▲ Show 20 Lines • Show All 10,399 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/haddsub.ll

	Show First 20 Lines • Show All 1,386 Lines • ▼ Show 20 Lines
	; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]			; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
	; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX-FAST-LABEL: fadd_reduce_v8f32:			; AVX-FAST-LABEL: fadd_reduce_v8f32:
	; AVX-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0			; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
	; AVX-FAST-NEXT: vaddps %ymm0, %ymm1, %ymm0			; AVX-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0
	; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]			; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
	; AVX-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0			; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0			; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
	; AVX-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)			%r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
	ret float %r			ret float %r
	}			}

	define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {			define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) {
	; SSE3-SLOW-LABEL: fadd_reduce_v4f64:			; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
	Show All 18 Lines
	; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]			; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
	; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX-FAST-LABEL: fadd_reduce_v4f64:			; AVX-FAST-LABEL: fadd_reduce_v4f64:
	; AVX-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0			; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0
	; AVX-FAST-NEXT: vaddpd %ymm0, %ymm1, %ymm0			; AVX-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0
	; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0			; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
	; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
	; AVX-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	%r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)			%r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
	ret double %r			ret double %r
	}			}

llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll

	Show First 20 Lines • Show All 926 Lines • ▼ Show 20 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX1-FAST-LABEL: partial_reduction_add_v8i32:			; AVX-FAST-LABEL: partial_reduction_add_v8i32:
	; AVX1-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX1-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX1-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	;
	; AVX2-FAST-LABEL: partial_reduction_add_v8i32:
	; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0
	; AVX2-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
	; AVX2-FAST-NEXT: vmovd %xmm0, %eax
	; AVX2-FAST-NEXT: vzeroupper
	; AVX2-FAST-NEXT: retq
	;
	; AVX512-FAST-LABEL: partial_reduction_add_v8i32:
	; AVX512-FAST: # %bb.0:
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0
	; AVX512-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
	; AVX512-FAST-NEXT: vmovd %xmm0, %eax
	; AVX512-FAST-NEXT: vzeroupper
	; AVX512-FAST-NEXT: retq
	%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0213 = add <8 x i32> %x, %x23			%x0213 = add <8 x i32> %x, %x23
	%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0123 = add <8 x i32> %x0213, %x13			%x0123 = add <8 x i32> %x0213, %x13
	%r = extractelement <8 x i32> %x0123, i32 0			%r = extractelement <8 x i32> %x0123, i32 0
	ret i32 %r			ret i32 %r
	}			}

	▲ Show 20 Lines • Show All 84 Lines • ▼ Show 20 Lines
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0			; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX-SLOW-NEXT: vmovd %xmm0, %eax			; AVX-SLOW-NEXT: vmovd %xmm0, %eax
	; AVX-SLOW-NEXT: vzeroupper			; AVX-SLOW-NEXT: vzeroupper
	; AVX-SLOW-NEXT: retq			; AVX-SLOW-NEXT: retq
	;			;
	; AVX1-FAST-LABEL: partial_reduction_sub_v8i32:			; AVX-FAST-LABEL: partial_reduction_sub_v8i32:
	; AVX1-FAST: # %bb.0:			; AVX-FAST: # %bb.0:
	; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0			; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0			; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
	; AVX1-FAST-NEXT: vmovd %xmm0, %eax			; AVX-FAST-NEXT: vmovd %xmm0, %eax
	; AVX1-FAST-NEXT: vzeroupper			; AVX-FAST-NEXT: vzeroupper
	; AVX1-FAST-NEXT: retq			; AVX-FAST-NEXT: retq
	;
	; AVX2-FAST-LABEL: partial_reduction_sub_v8i32:
	; AVX2-FAST: # %bb.0:
	; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX2-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
	; AVX2-FAST-NEXT: vmovd %xmm0, %eax
	; AVX2-FAST-NEXT: vzeroupper
	; AVX2-FAST-NEXT: retq
	;
	; AVX512-FAST-LABEL: partial_reduction_sub_v8i32:
	; AVX512-FAST: # %bb.0:
	; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX512-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0
	; AVX512-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
	; AVX512-FAST-NEXT: vmovd %xmm0, %eax
	; AVX512-FAST-NEXT: vzeroupper
	; AVX512-FAST-NEXT: retq
	%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0213 = sub <8 x i32> %x, %x23			%x0213 = sub <8 x i32> %x, %x23
	%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>			%x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
	%x0123 = sub <8 x i32> %x0213, %x13			%x0123 = sub <8 x i32> %x0213, %x13
	%r = extractelement <8 x i32> %x0123, i32 0			%r = extractelement <8 x i32> %x0123, i32 0
	ret i32 %r			ret i32 %r
	}			}

	▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines