This is an archive of the discontinued LLVM Phabricator instance.

[DAGComb] Do not turn insert_elt into shuffle for single elt vectors.
ClosedPublic

Authored by fhahn on May 28 2020, 4:05 AM.

Download Raw Diff

Details

Reviewers

spatel
efriedma
dmgreen
RKSimon

Commits

rGea3e2201ff91: [DAGComb] Do not turn insert_elt into shuffle for single elt vectors. (#1287)
rGd20a3d35e187: [DAGComb] Do not turn insert_elt into shuffle for single elt vectors.

Summary

Currently combineInsertEltToShuffle turns an insert_vector_elt into a
vector_shuffle, even if the inserted element is a vector with a single
element. In this case, the additional shuffle is unlikely to be more
efficient than an insert_vector_elt.

Additionally, this fixes an infinite cycle in DAGCombine, where
combineInsertEltToShuffle turns an insert_vector_elt into a shuffle,
which gets turned back into an insert_vector_elt/extract_vector_elt by
a custom AArch64 lowering (in visitVECTOR_SHUFFLE).

Such insert_vector_elt and extract_vector_elt combinations can be
lowered efficiently using mov on AArch64.

There are 2 test changes in arm64-neon-copy.ll: we now use one or two
mov instructions instead of a single zip1. The second mov in ins1f2 is
needed because we have to move the result into the result register; I
think it is not really related to the DAGCombine fold.
In any case, on most uarchs, mov should be cheaper than zip1. On a
Cortex-A75, for example, zip1 is twice as expensive as mov
(https://developer.arm.com/docs/101398/latest/arm-cortex-a75-software-optimization-guide-v20).

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

fhahn created this revision. · May 28 2020, 4:05 AM

Herald added a project: Restricted Project. · View Herald Transcript · May 28 2020, 4:05 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

LGTM

This revision is now accepted and ready to land. · May 28 2020, 4:27 AM

Harbormaster failed remote builds in B58193: Diff 266808! · May 28 2020, 5:57 AM

Closed by commit rGd20a3d35e187: [DAGComb] Do not turn insert_elt into shuffle for single elt vectors. (authored by fhahn). · Explain Why · May 29 2020, 5:24 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

4 lines

test/

CodeGen/

AArch64/

arm64-neon-copy.ll

5 lines

vector-insert-shuffle-cycle.ll

35 lines

Diff 267192

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 17,131 Lines • ▼ Show 20 Lines	if (InsertVal.getOpcode() != ISD::BITCAST \|\| !InsertVal.hasOneUse() \|\|
!InsertVal.getOperand(0).getValueType().isVector())		!InsertVal.getOperand(0).getValueType().isVector())
return SDValue();		return SDValue();

SDValue SubVec = InsertVal.getOperand(0);		SDValue SubVec = InsertVal.getOperand(0);
SDValue DestVec = N->getOperand(0);		SDValue DestVec = N->getOperand(0);
EVT SubVecVT = SubVec.getValueType();		EVT SubVecVT = SubVec.getValueType();
EVT VT = DestVec.getValueType();		EVT VT = DestVec.getValueType();
unsigned NumSrcElts = SubVecVT.getVectorNumElements();		unsigned NumSrcElts = SubVecVT.getVectorNumElements();
		// If the source only has a single vector element, the cost of creating adding
		// it to a vector is likely to exceed the cost of a insert_vector_elt.
		if (NumSrcElts == 1)
		return SDValue();
unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();		unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
unsigned NumMaskVals = ExtendRatio * NumSrcElts;		unsigned NumMaskVals = ExtendRatio * NumSrcElts;

// Step 1: Create a shuffle mask that implements this insert operation. The		// Step 1: Create a shuffle mask that implements this insert operation. The
// vector that we are inserting into will be operand 0 of the shuffle, so		// vector that we are inserting into will be operand 0 of the shuffle, so
// those elements are just 'i'. The inserted subvector is in the first		// those elements are just 'i'. The inserted subvector is in the first
// positions of operand 1 of the shuffle. Example:		// positions of operand 1 of the shuffle. Example:
// insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}		// insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
▲ Show 20 Lines • Show All 4,697 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

Show First 20 Lines • Show All 194 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1		%tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
ret <4 x float> %tmp4		ret <4 x float> %tmp4
}		}

define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {		define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
; CHECK-LABEL: ins1f2:		; CHECK-LABEL: ins1f2:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0		; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: zip1 v0.2d, v1.2d, v0.2d		; CHECK-NEXT: mov v1.d[1], v0.d[0]
		; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%tmp3 = extractelement <1 x double> %tmp1, i32 0		%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1		%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4		ret <2 x double> %tmp4
}		}

define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) {		define <2 x double> @ins1f2_args_flipped(<2 x double> %tmp2, <1 x double> %tmp1) {
; CHECK-LABEL: ins1f2_args_flipped:		; CHECK-LABEL: ins1f2_args_flipped:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1		; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d		; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%tmp3 = extractelement <1 x double> %tmp1, i32 0		%tmp3 = extractelement <1 x double> %tmp1, i32 0
%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1		%tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
ret <2 x double> %tmp4		ret <2 x double> %tmp4
}		}

define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {		define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
; CHECK-LABEL: ins16b8:		; CHECK-LABEL: ins16b8:
▲ Show 20 Lines • Show All 1,688 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc %s -o - \| FileCheck %s

				target triple = "arm64-apple-ios13.4.0"

				; Make we do not get stuck in a cycle in DAGCombiner.

				define void @test(i1 %c, <1 x double>* %ptr) {
				; CHECK-LABEL: test:
				; CHECK: ; %bb.0: ; %entry
				; CHECK-NEXT: movi d0, #0000000000000000
				; CHECK-NEXT: tbz w0, #0, LBB0_2
				; CHECK-NEXT: ; %bb.1: ; %bb1
				; CHECK-NEXT: ldr d0, [x1]
				; CHECK-NEXT: LBB0_2: ; %bb2
				; CHECK-NEXT: ldr q1, [x8]
				; CHECK-NEXT: mov.d v1[0], v0[0]
				; CHECK-NEXT: str q1, [x8]
				; CHECK-NEXT: ret
				entry:
				br i1 %c, label %bb1, label %bb2

				bb1:
				%lv1 = load <1 x double>, <1 x double>* %ptr, align 16
				br label %bb2

				bb2:
				%p = phi <1 x double> [ %lv1, %bb1 ], [ zeroinitializer, %entry ]
				%vecext19 = extractelement <1 x double> %p, i32 0
				%arrayidx21 = getelementptr inbounds [4 x <4 x double>], [4 x <4 x double>]* undef, i64 0, i64 3
				%lv2 = load <4 x double>, <4 x double>* %arrayidx21, align 16
				%vecins22 = insertelement <4 x double> %lv2, double %vecext19, i32 2
				store <4 x double> %vecins22, <4 x double>* %arrayidx21, align 16
				ret void
				}