Index: a.c
===================================================================
--- /dev/null
+++ a.c
@@ -0,0 +1,53 @@
+// NOTE(review): header name missing from the #include below - TODO restore.
+#include
+
+// 16-byte and 8-byte float vectors (compiler vector_size extension).
+typedef float __attribute__((__vector_size__(16))) vec_float4;
+typedef float __attribute__((__vector_size__(8))) vec_float2;
+typedef const vec_float4 vec_float4_arg;
+typedef const vec_float2 vec_float2_arg;
+
+// Element-wise sum of two 2-float vectors.
+vec_float2 add2f(vec_float2_arg a, vec_float2_arg b) {
+  return a + b; // good
+}
+
+// Packs (a0, a1) and (b0, b1) into 4-float vectors through a union, then
+// performs a single 4-wide add.
+vec_float4 add2f_0(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  union {
+    vec_float2 f2[2]; // the two 8-byte halves overlaying f4
+    vec_float4 f4;
+  } a, b;
+
+  a.f2[0] = a0;
+  a.f2[1] = a1;
+  b.f2[0] = b0;
+  b.f2[1] = b1;
+
+  return a.f4 + b.f4; // bad - to/from gpr
+}
+
+// Adds the two halves separately and packs the two sums through a union.
+vec_float4 add2f_1(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  union {
+    vec_float2 f2[2]; // the two 8-byte halves overlaying f4
+    vec_float4 f4;
+  } c;
+
+  c.f2[0] = a0 + b0;
+  c.f2[1] = a1 + b1;
+  // bad (ish) - workaround for add2f_0 but requires 2 adds + 1 shuffle
+  return c.f4;
+}
+
+// Concatenates the halves with explicit shuffles, then does one 4-wide add.
+vec_float4 add2f_2(vec_float2_arg a0, vec_float2_arg b0, vec_float2_arg a1,
+                   vec_float2_arg b1) {
+  vec_float4 a = __builtin_shufflevector(a0, a1, 0, 1, 2, 3);
+  vec_float4 b = __builtin_shufflevector(b0, b1, 0, 1, 2, 3);
+  return a + b; // good - but only because we manipulate shuffles directly - 1
+                // add + 2 shuffles
+}
Index: add-packed.ll
===================================================================
--- /dev/null
+++ add-packed.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'a.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind readnone ssp uwtable
+define double @add2f(double %a.coerce, double %b.coerce) #0 {
+  %1 = bitcast double %a.coerce to <2 x float>
+  %2 = bitcast double %b.coerce to <2 x float>
+  %3 = fadd <2 x float> %1, %2
+  %4 = bitcast <2 x float> %3 to double
+  ret double %4
+}
+
+define <4 x float> @add2f_042(<2 x float> %a0.coerce, <2 x float> %a1.coerce, <4 x float> %b1.coerce) #0 {
+  %1 =
bitcast <2 x float> %a0.coerce to <2 x float> + %2 = bitcast <2 x float> %a1.coerce to <2 x float> + %3 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> + %4 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> + %5 = select <4 x i1> , <4 x float> %4, <4 x float> %3 + %6 = shufflevector <4 x float> %b1.coerce, <4 x float> undef, <4 x i32> + %7 = select <4 x i1> , <4 x float> %6, <4 x float> %3 + %8 = fadd <4 x float> %5, %7 + ret <4 x float> %8 +} + +;; Function Attrs: nounwind readnone ssp uwtable +;define <4 x float> @add2f_0(<2 x float> %a0.coerce, <2 x float> %b0.coerce, <2 x float> %a1.coerce, <2 x float> %b1.coerce) #0 { +; %1 = bitcast <2 x float> %a0.coerce to <2 x float> +; %2 = bitcast <2 x float> %b0.coerce to <2 x float> +; %3 = bitcast <2 x float> %a1.coerce to <2 x float> +; %4 = bitcast <2 x float> %b1.coerce to <2 x float> +; %5 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> +; %6 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> +; %7 = select <4 x i1> , <4 x float> %6, <4 x float> %5 +; %8 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> +; %9 = shufflevector <2 x float> %4, <2 x float> undef, <4 x i32> +; %10 = select <4 x i1> , <4 x float> %9, <4 x float> %8 +; %11 = fadd <4 x float> %7, %10 +; ret <4 x float> %11 +;} +; +;; Function Attrs: nounwind readnone ssp uwtable +;define <4 x float> @add2f_1(double %a0.coerce, double %b0.coerce, double %a1.coerce, double %b1.coerce) #0 { +; %1 = bitcast double %a0.coerce to <2 x float> +; %2 = bitcast double %b0.coerce to <2 x float> +; %3 = bitcast double %a1.coerce to <2 x float> +; %4 = bitcast double %b1.coerce to <2 x float> +; %5 = fadd <2 x float> %1, %2 +; %6 = shufflevector <2 x float> %5, <2 x float> undef, <4 x i32> +; %7 = fadd <2 x float> %3, %4 +; %8 = shufflevector <2 x float> %7, <2 x float> undef, <4 x i32> +; %9 = select <4 x i1> , <4 x float> %8, <4 x float> %6 +; ret <4 x float> %9 +;} +; +;; Function Attrs: nounwind readnone 
ssp uwtable +;define <4 x float> @add2f_2(double %a0.coerce, double %b0.coerce, double %a1.coerce, double %b1.coerce) #0 { +; %1 = bitcast double %a0.coerce to <2 x float> +; %2 = bitcast double %b0.coerce to <2 x float> +; %3 = bitcast double %a1.coerce to <2 x float> +; %4 = bitcast double %b1.coerce to <2 x float> +; %5 = shufflevector <2 x float> %1, <2 x float> %3, <4 x i32> +; %6 = shufflevector <2 x float> %2, <2 x float> %4, <4 x i32> +; %7 = fadd <4 x float> %5, %6 +; ret <4 x float> %7 +;} +; +attributes #0 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"clang version 3.5 "} Index: insertps.after.s =================================================================== --- /dev/null +++ insertps.after.s @@ -0,0 +1,49 @@ + .section __TEXT,__text,regular,pure_instructions + .macosx_version_min 10, 9 + .globl _insertWa + .align 4, 0x90 +_insertWa: ## @insertWa + .cfi_startproc +## BB#0: + vinsertps $48, 192(%rdi), %xmm0, %xmm0 + retq + .cfi_endproc + + .globl _insertWb + .align 4, 0x90 +_insertWb: ## @insertWb + .cfi_startproc +## BB#0: + vinsertps $48, (%rdi), %xmm0, %xmm0 + retq + .cfi_endproc + + .globl _insertWc + .align 4, 0x90 +_insertWc: ## @insertWc + .cfi_startproc +## BB#0: + vinsertps $48, 12(%rdi), %xmm0, %xmm0 + retq + .cfi_endproc + + .globl _insertWd + .align 4, 0x90 +_insertWd: ## @insertWd + .cfi_startproc +## BB#0: + vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0] + retq + .cfi_endproc + + .globl _insertWe + .align 4, 0x90 +_insertWe: ## @insertWe + .cfi_startproc +## BB#0: + vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 + retq + .cfi_endproc + + +.subsections_via_symbols Index: insertps.before.s =================================================================== --- /dev/null +++ 
insertps.before.s @@ -0,0 +1,51 @@ + .section __TEXT,__text,regular,pure_instructions + .macosx_version_min 10, 9 + .globl _insertWa + .align 4, 0x90 +_insertWa: ## @insertWa + .cfi_startproc +## BB#0: + vmovaps (%rdi), %xmm1 + vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0] + retq + .cfi_endproc + + .globl _insertWb + .align 4, 0x90 +_insertWb: ## @insertWb + .cfi_startproc +## BB#0: + vinsertps $48, (%rdi), %xmm0, %xmm0 + retq + .cfi_endproc + + .globl _insertWc + .align 4, 0x90 +_insertWc: ## @insertWc + .cfi_startproc +## BB#0: + vinsertps $48, 12(%rdi), %xmm0, %xmm0 + retq + .cfi_endproc + + .globl _insertWd + .align 4, 0x90 +_insertWd: ## @insertWd + .cfi_startproc +## BB#0: + vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0] + retq + .cfi_endproc + + .globl _insertWe + .align 4, 0x90 +_insertWe: ## @insertWe + .cfi_startproc +## BB#0: + vbroadcastss (%rdi,%rsi,4), %xmm1 + vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0] + retq + .cfi_endproc + + +.subsections_via_symbols Index: insertps.ll =================================================================== --- /dev/null +++ insertps.ll @@ -0,0 +1,77 @@ +; ModuleID = '/Users/filipe/work/tests/pcg/31474.c' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Function Attrs: nounwind readonly ssp uwtable +define <4 x float> @insertWa(<4 x float> %a, <4 x float>* nocapture readonly %pb) #0 { + %1 = load <4 x float>* %pb, align 16, !tbaa !1 + %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) + ret <4 x float> %2 +} + +; Function Attrs: nounwind readonly ssp uwtable +define <4 x float> @insertWe(<4 x float> %a, float* nocapture readonly %fb, i64 %index) #0 { + %1 = getelementptr inbounds float* %fb, i64 %index + %2 = load float* %1, align 4, !tbaa !4 + %3 = insertelement <4 x float> undef, float %2, i32 0 + %4 = insertelement <4 x float> %3, float %2, i32 1 + %5 = 
insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) #1
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWb(<4 x float> %a, float* nocapture readonly %pb) #0 {
+  %1 = extractelement <4 x float> %a, i32 0
+  %2 = insertelement <4 x float> undef, float %1, i32 0
+  %3 = extractelement <4 x float> %a, i32 1
+  %4 = insertelement <4 x float> %2, float %3, i32 1
+  %5 = extractelement <4 x float> %a, i32 2
+  %6 = insertelement <4 x float> %4, float %5, i32 2
+  %7 = load float* %pb, align 4, !tbaa !4
+  %8 = insertelement <4 x float> %6, float %7, i32 3
+  ret <4 x float> %8
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWc(<4 x float> %a, <4 x float>* nocapture readonly %pb) #0 {
+  %1 = load <4 x float>* %pb, align 16, !tbaa !1
+  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32>
+  ret <4 x float> %2
+}
+
+; Function Attrs: nounwind readnone ssp uwtable
+define <4 x float> @insertWd(<4 x float> %a, <4 x float> %b) #2 {
+  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
+  ret <4 x float> %1
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define <4 x float> @insertWe(<4 x float> %a, float* nocapture readonly %fb, i64 %index) #0 {
+  %1 = getelementptr inbounds float* %fb, i64 %index
+  %2 = load float* %1, align 4, !tbaa !4
+  %3 = insertelement <4 x float> undef, float %2, i32 0
+  %4 = insertelement <4 x float> %3, float %2, i32 1
+  %5 = insertelement <4 x float> %4, float %2, i32 2
+  %6 = insertelement <4 x float> %5, float %2, i32 3
+  %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+  ret <4 x float> %7
+}
+
+attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5 "}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"omnipotent char", metadata !3, i64 0}
+!3 = metadata !{metadata !"Simple C/C++ TBAA"}
+!4 = metadata !{metadata !5, metadata !5, i64 0}
+!5 = metadata !{metadata !"float", metadata !2, i64 0}
Index: lib/Transforms/InstCombine/InstCombineSelect.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -731,6 +731,85 @@
   return V;
 }
 
+/// Returns true when the select \p SI, with condition \p CondVal and arms
+/// \p TrueVal / \p FalseVal, can be rewritten as a single shufflevector by
+/// FoldSelectOfShuffleVectors.
+///
+/// That requires a vector condition that is a ConstantVector, and both arms to
+/// be shufflevectors whose second operand is undef and has the same type.
+static bool CanFoldSelectOfShuffleVectors(SelectInst &SI, Value *CondVal,
+                                          Value *TrueVal, Value *FalseVal) {
+  VectorType *VecTy = dyn_cast<VectorType>(CondVal->getType());
+  ShuffleVectorInst *TrueSV = dyn_cast<ShuffleVectorInst>(TrueVal);
+  ShuffleVectorInst *FalseSV = dyn_cast<ShuffleVectorInst>(FalseVal);
+  // All three must hold: TrueSV and FalseSV are dereferenced below.
+  if (!VecTy || !TrueSV || !FalseSV)
+    return false;
+
+  ConstantVector *CondV = dyn_cast<ConstantVector>(CondVal);
+  if (!CondV)
+    return false;
+
+  Value *TrueV2 = TrueSV->getOperand(1);
+  Value *FalseV2 = FalseSV->getOperand(1);
+  // We just check for *V2 being undef since instcombine will turn
+  // shufflevector(undef, v) into shufflevector(v, undef)
+  if (!(isa<UndefValue>(TrueV2) && isa<UndefValue>(FalseV2)))
+    return false;
+
+  // The source vectors (not the mask) for the shuffle vectors have to
+  // have the same type. Otherwise we could end up trying to do a
+  // shufflevector <4 x i32> <2 x i32>
+  Type *TrueShuffleSrc = TrueV2->getType();
+  Type *FalseShuffleSrc = FalseV2->getType();
+  if (TrueShuffleSrc != FalseShuffleSrc)
+    return false;
+
+  return true;
+}
+
+/// Builds the shufflevector that replaces the select \p SI.
+///
+/// Only safe to call if CanFoldSelectOfShuffleVectors returned true for
+/// \p SI. Result lane i comes from the first operand of the false arm's shuffle
+/// when condition element i is not one, and from the first operand of the true
+/// arm's shuffle otherwise. Returns nullptr if a condition element is not a
+/// ConstantInt.
+static Instruction *FoldSelectOfShuffleVectors(SelectInst &SI) {
+  // Indexed by the condition bit: [0] is the false arm, [1] the true arm.
+  ShuffleVectorInst *Sources[] = { cast<ShuffleVectorInst>(SI.getFalseValue()),
+                                   cast<ShuffleVectorInst>(SI.getTrueValue()) };
+  ConstantVector *CondV = cast<ConstantVector>(SI.getCondition());
+
+  unsigned NumElems = cast<VectorType>(SI.getType())->getNumElements();
+  unsigned NumSourceElems =
+      cast<VectorType>(Sources[0]->getOperand(0)->getType())->getNumElements();
+  Type *Int32Ty = Type::getInt32Ty(SI.getContext());
+  SmallVector<Constant *, 16> ShuffleMask;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    ConstantInt *Element = dyn_cast<ConstantInt>(CondV->getAggregateElement(i));
+    if (!Element)
+      return nullptr;
+
+    unsigned Selector = Element->isOne();
+    int SourceIdx = Sources[Selector]->getMaskValue(i);
+    // A lane that is undef in the chosen shuffle, or that reads that shuffle's
+    // undef second operand, stays undef. Emitting -1 as an integer mask element
+    // would yield an out-of-range index instead.
+    if (SourceIdx < 0 || static_cast<unsigned>(SourceIdx) >= NumSourceElems) {
+      ShuffleMask.push_back(UndefValue::get(Int32Ty));
+      continue;
+    }
+    // Lanes from the true arm index into the second operand of the new shuffle.
+    ShuffleMask.push_back(
+        ConstantInt::get(Int32Ty, SourceIdx + NumSourceElems * Selector));
+  }
+
+  return new ShuffleVectorInst(Sources[0]->getOperand(0),
+                               Sources[1]->getOperand(0),
+                               ConstantVector::get(ShuffleMask));
+}
+
 Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   Value *CondVal = SI.getCondition();
   Value *TrueVal = SI.getTrueValue();
@@ -1005,6 +1084,13 @@
     if (isa<ConstantAggregateZero>(CondVal)) {
       return ReplaceInstUsesWith(SI, FalseVal);
     }
+
+    // A select with a constant condition between two shufflevectors whose
+    // second operand is undef takes each lane from one of two source vectors,
+    // so it can be merged into a single shufflevector and the select removed.
+ if (CanFoldSelectOfShuffleVectors(SI, CondVal, TrueVal, FalseVal)) + if (Instruction *I = FoldSelectOfShuffleVectors(SI)) + return I; } return nullptr; Index: test/Transforms/InstCombine/select.ll =================================================================== --- test/Transforms/InstCombine/select.ll +++ test/Transforms/InstCombine/select.ll @@ -1031,3 +1031,29 @@ ; CHECK: lshr exact i32 %2, 1 ; CHECK: xor i32 %3, 42 } + +define <4 x float> @add2f_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { +; CHECK-LABEL: @add2f_0 +; CHECK-NOT: select +; CHECK: ret + %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> + %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> + %3 = select <4 x i1> , <4 x float> %2, <4 x float> %1 + %4 = shufflevector <2 x float> %b0, <2 x float> undef, <4 x i32> + %5 = shufflevector <2 x float> %b1, <2 x float> undef, <4 x i32> + %6 = select <4 x i1> , <4 x float> %5, <4 x float> %4 + %7 = fadd <4 x float> %3, %6 + ret <4 x float> %7 +} + +define <4 x float> @add2f_1(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) { +; CHECK-LABEL: @add2f_1 +; CHECK-NOT: select +; CHECK: ret + %1 = fadd <2 x float> %a0, %b0 + %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> + %3 = fadd <2 x float> %a1, %b1 + %4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> + %5 = select <4 x i1> , <4 x float> %4, <4 x float> %2 + ret <4 x float> %5 +}