Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -2336,9 +2336,10 @@ } /// Return true if it's significantly cheaper to shift a vector by a uniform - /// scalar than by an amount which will vary across each lane. On x86, for - /// example, there is a "psllw" instruction for the former case, but no simple - /// instruction for a general "a << b" operation on vectors. + /// scalar than by an amount which will vary across each lane. On x86 before + /// AVX2 for example, there is a "psllw" instruction for the former case, but + /// no simple instruction for a general "a << b" operation on vectors. + /// This should also apply to lowering for vector funnel shifts (rotates). virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; } Index: llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6434,7 +6434,8 @@ bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { BasicBlock *DefBB = SVI->getParent(); - // Only do this xform if variable vector shifts are particularly expensive. + // Only do this transform if variable vector shifts are expensive. + // We are also using shift cost as a proxy for rotate and funnel shift ops. if (!TLI->isVectorShiftByScalarCheap(SVI->getType())) return false; @@ -6454,8 +6455,13 @@ BasicBlock *UserBB = UI->getParent(); if (UserBB == DefBB) continue; - // For now only apply this when the splat is used by a shift instruction. - if (!UI->isShift()) continue; + // Check if the splat is used by a shift or funnel-shift instruction. 
+ if (!UI->isShift()) { + auto *II = dyn_cast<IntrinsicInst>(UI); + if (!II || (II->getIntrinsicID() != Intrinsic::fshl && + II->getIntrinsicID() != Intrinsic::fshr)) + continue; + } // Everything checks out, sink the shuffle if the user's block doesn't // already have a copy. Index: llvm/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.h +++ llvm/lib/Target/X86/X86ISelLowering.h @@ -1028,6 +1028,8 @@ int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// This is used to enable splatted operand transforms for vector shifts + /// and vector funnel shifts. bool isVectorShiftByScalarCheap(Type *Ty) const override; /// Add x86-specific opcodes to the default list. Index: llvm/test/CodeGen/X86/vector-fshl-128.ll =================================================================== --- llvm/test/CodeGen/X86/vector-fshl-128.ll +++ llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2143,27 +2143,21 @@ ; SSE2-LABEL: sink_splatvar: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pslld $23, %xmm0 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: andl $31, %ecx +; SSE2-NEXT: movl $32, %edx +; SSE2-NEXT: subl %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB8_1: # %loop ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq 
{{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psrld %xmm0, %xmm3 +; SSE2-NEXT: pslld %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 ; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax) ; SSE2-NEXT: addq $16, %rax ; SSE2-NEXT: jne .LBB8_1 @@ -2173,26 +2167,22 @@ ; SSE41-LABEL: sink_splatvar: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movd %esi, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pslld $23, %xmm0 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32] +; SSE41-NEXT: psubd %xmm1, %xmm0 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: .p2align 4, 0x90 ; SSE41-NEXT: .LBB8_1: # %loop ; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE41-NEXT: pmuludq %xmm1, %xmm3 -; SSE41-NEXT: pmuludq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax) +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld %xmm0, %xmm3 +; SSE41-NEXT: pslld %xmm1, %xmm2 +; SSE41-NEXT: por %xmm3, %xmm2 +; SSE41-NEXT: movdqu %xmm2, 
1024(%rdi,%rax) ; SSE41-NEXT: addq $16, %rax ; SSE41-NEXT: jne .LBB8_1 ; SSE41-NEXT: # %bb.2: # %end @@ -2201,25 +2191,20 @@ ; AVX1-LABEL: sink_splatvar: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovd %esi, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpslld $23, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [32,32,32,32] +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB8_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax) ; AVX1-NEXT: addq $16, %rax ; AVX1-NEXT: jne .LBB8_1 @@ -2380,29 +2365,23 @@ ; X32-SSE-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE-NEXT: .cfi_offset %esi, -8 ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-SSE-NEXT: xorl %ecx, %ecx -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 
-; X32-SSE-NEXT: pslld $23, %xmm0 -; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: movd %xmm0, %edx +; X32-SSE-NEXT: andl $31, %edx +; X32-SSE-NEXT: movl $32, %esi +; X32-SSE-NEXT: subl %edx, %esi +; X32-SSE-NEXT: movd %esi, %xmm0 +; X32-SSE-NEXT: movd %edx, %xmm1 ; X32-SSE-NEXT: xorl %edx, %edx ; X32-SSE-NEXT: .p2align 4, 0x90 ; X32-SSE-NEXT: .LBB8_1: # %loop ; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1 ; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X32-SSE-NEXT: por %xmm4, %xmm2 +; X32-SSE-NEXT: movdqa %xmm2, %xmm3 +; X32-SSE-NEXT: psrld %xmm0, %xmm3 +; X32-SSE-NEXT: pslld %xmm1, %xmm2 +; X32-SSE-NEXT: por %xmm3, %xmm2 ; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4) ; X32-SSE-NEXT: addl $4, %ecx ; X32-SSE-NEXT: adcl $0, %edx Index: llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll =================================================================== --- llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll +++ llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll @@ -180,23 +180,59 @@ } define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) { -; CHECK-LABEL: @funnel_splatvar( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> 
[[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4 -; CHECK-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]]) -; CHECK-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 -; CHECK-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void +; CHECK-SSE2-LABEL: @funnel_splatvar( +; CHECK-SSE2-NEXT: entry: +; CHECK-SSE2-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0 +; CHECK-SSE2-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-SSE2: vector.body: +; CHECK-SSE2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-SSE2-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-SSE2-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] +; CHECK-SSE2-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>* +; CHECK-SSE2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4 +; CHECK-SSE2-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]]) +; CHECK-SSE2-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4 +; CHECK-SSE2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-SSE2-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 +; 
CHECK-SSE2-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK-SSE2: for.cond.cleanup: +; CHECK-SSE2-NEXT: ret void +; +; CHECK-XOP-LABEL: @funnel_splatvar( +; CHECK-XOP-NEXT: entry: +; CHECK-XOP-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0 +; CHECK-XOP-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-XOP-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-XOP: vector.body: +; CHECK-XOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-XOP-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] +; CHECK-XOP-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>* +; CHECK-XOP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4 +; CHECK-XOP-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]]) +; CHECK-XOP-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4 +; CHECK-XOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-XOP-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 +; CHECK-XOP-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK-XOP: for.cond.cleanup: +; CHECK-XOP-NEXT: ret void +; +; CHECK-AVX-LABEL: @funnel_splatvar( +; CHECK-AVX-NEXT: entry: +; CHECK-AVX-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0 +; CHECK-AVX-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-AVX: vector.body: +; CHECK-AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-AVX-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]] +; CHECK-AVX-NEXT: [[T1:%.*]] = bitcast 
i32* [[T0]] to <8 x i32>* +; CHECK-AVX-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4 +; CHECK-AVX-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]]) +; CHECK-AVX-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4 +; CHECK-AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-AVX-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 +; CHECK-AVX-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK-AVX: for.cond.cleanup: +; CHECK-AVX-NEXT: ret void ; entry: %broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0