diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
@@ -1983,6 +1984,63 @@
       replaceInstUsesWith(CI, Res);
       return eraseInstFromFunction(CI);
     }
+    // Reached only when the case above did not return: continue with the
+    // shuffle-removal logic shared with the reductions below.
+    LLVM_FALLTHROUGH;
+  }
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_fmax:
+  case Intrinsic::vector_reduce_fmin:
+  case Intrinsic::vector_reduce_fadd:
+  case Intrinsic::vector_reduce_fmul: {
+    // reduce(shuffle(V)) --> reduce(V) when the shuffle only permutes the
+    // lanes of V.
+    //
+    // fadd/fmul take the vector as their second argument and are only
+    // rewritten when the call allows reassociation.
+    const bool IsFAddOrFMul = IID == Intrinsic::vector_reduce_fadd ||
+                              IID == Intrinsic::vector_reduce_fmul;
+    if (IsFAddOrFMul && !II->hasAllowReassoc())
+      break;
+    const unsigned ArgIdx = IsFAddOrFMul ? 1 : 0;
+    Value *Arg = II->getArgOperand(ArgIdx);
+    if (!isa<FixedVectorType>(Arg->getType()))
+      break;
+
+    // The shuffled vector may be either operand of the shuffle; the other one
+    // has to be undef.
+    Value *V;
+    ArrayRef<int> Mask;
+    if (!match(Arg, m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))) &&
+        !match(Arg, m_Shuffle(m_Undef(), m_Value(V), m_Mask(Mask))))
+      break;
+    if (!cast<ShuffleVectorInst>(Arg)->isSingleSource())
+      break;
+
+    // Mark every source lane the mask reads; an undef or repeated lane ends
+    // the scan early and leaves at least one bit unset.
+    const int Sz = Mask.size();
+    SmallBitVector UsedIndices(Sz);
+    for (int Idx : Mask) {
+      if (Idx == UndefMaskElem)
+        break;
+      // Mask values of Sz and above refer to the second shuffle operand.
+      const int Lane = Idx >= Sz ? Idx - Sz : Idx;
+      if (UsedIndices.test(Lane))
+        break;
+      UsedIndices.set(Lane);
+    }
+    // Can remove shuffle iff just shuffled elements, no repeats, undefs, or
+    // other changes. Return the call itself rather than nullptr so that the
+    // in-place operand change is reported to the combiner.
+    if (UsedIndices.all())
+      return replaceOperand(*II, ArgIdx, V);
     break;
   }
   default: {
diff --git a/llvm/test/Transforms/InstCombine/reduction-shufflevector.ll b/llvm/test/Transforms/InstCombine/reduction-shufflevector.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/reduction-shufflevector.ll
@@ -0,0 +1,278 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; A shufflevector that only permutes the lanes of one source vector does not
+; change the result of a reassociable reduction, so instcombine is expected to
+; pass the shuffle's source directly to the llvm.vector.reduce.* call.
+;
+; NOTE(review): the mask operand of every shufflevector below, and one constant
+; vector operand in the CHECK lines, is absent from this copy of the patch;
+; restore them before running the test.
+
+define i32 @reduce_add(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_add(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_or(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_or(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> poison, <4 x i32> %x, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_and(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_and(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_xor(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_xor(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> poison, <4 x i32> %x, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_umax(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_umax(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_umin(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_umin(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_smax(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_smax(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_smin(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_smin(
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[X:%.*]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define float @reduce_fmax(<4 x float> %x) {
+; CHECK-LABEL: @reduce_fmax(
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[X:%.*]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fmin(<4 x float> %x) {
+; CHECK-LABEL: @reduce_fmin(
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[X:%.*]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fadd(float %a, <4 x float> %x) {
+; CHECK-LABEL: @reduce_fadd(
+; CHECK-NEXT:    [[RES:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float [[A:%.*]], <4 x float> [[X:%.*]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> %x, <4 x i32>
+  %res = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %a, <4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fmul(float %a, <4 x float> %x) {
+; CHECK-LABEL: @reduce_fmul(
+; CHECK-NEXT:    [[RES:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float [[A:%.*]], <4 x float> [[X:%.*]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> zeroinitializer, <4 x i32>
+  %res = call reassoc float @llvm.vector.reduce.fmul.v4f32(float %a, <4 x float> %shuf)
+  ret float %res
+}
+
+; Failed cases: the shuffle must be kept, e.g. when it changes the vector
+; length, mixes two sources, or feeds an fadd/fmul reduction without 'reassoc'.
+
+define i32 @reduce_add_failed(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_add_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> %x, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_or_failed(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_or_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_and_failed(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_and_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_xor_failed(<4 x i32> %x) {
+; CHECK-LABEL: @reduce_xor_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_umax_failed(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @reduce_umax_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_umin_failed(<2 x i32> %x) {
+; CHECK-LABEL: @reduce_umin_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <2 x i32> %x, <2 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_smax_failed(<8 x i32> %x) {
+; CHECK-LABEL: @reduce_smax_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define i32 @reduce_smin_failed(<8 x i32> %x) {
+; CHECK-LABEL: @reduce_smin_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[SHUF]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %shuf = shufflevector <8 x i32> %x, <8 x i32> %x, <4 x i32>
+  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %shuf)
+  ret i32 %res
+}
+
+define float @reduce_fmax_failed(<4 x float> %x) {
+; CHECK-LABEL: @reduce_fmax_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[SHUF]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fmin_failed(<4 x float> %x) {
+; CHECK-LABEL: @reduce_fmin_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[SHUF]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fadd_failed(float %a, <4 x float> %x) {
+; CHECK-LABEL: @reduce_fadd_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[A:%.*]], <4 x float> [[SHUF]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fadd.v4f32(float %a, <4 x float> %shuf)
+  ret float %res
+}
+
+define float @reduce_fmul_failed(float %a, <2 x float> %x) {
+; CHECK-LABEL: @reduce_fmul_failed(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RES:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float [[A:%.*]], <4 x float> [[SHUF]])
+; CHECK-NEXT:    ret float [[RES]]
+;
+  %shuf = shufflevector <2 x float> %x, <2 x float> poison, <4 x i32>
+  %res = call float @llvm.vector.reduce.fmul.v4f32(float %a, <4 x float> %shuf)
+  ret float %res
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
+declare float @llvm.vector.reduce.fadd.v4f32(float %a, <4 x float> %b)
+declare float @llvm.vector.reduce.fmul.v4f32(float %a, <4 x float> %b)