diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -922,7 +922,10 @@ setOperationAction(ISD::FREM, MVT::f128, Expand); } setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); - + setOperationAction(ISD::BSWAP, MVT::v8i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i32, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i64, Legal); + setOperationAction(ISD::BSWAP, MVT::v1i128, Legal); } if (Subtarget.hasP9Altivec()) { diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -2948,19 +2948,25 @@ // Vector Byte-Reverse H/W/D/Q Word def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>; - def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, []>; - def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>; + def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc, + [(set v4i32:$XT, (bswap v4i32:$XB))]>; + def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, + [(set v2i64:$XT, (bswap v2i64:$XB))]>; def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>; // Vector Reverse def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)), (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; + def : Pat<(v8i16 (bswap v8i16 :$A)), + (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)), (v4i32 (XXBRW $A))>; def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)), (v2i64 (XXBRD $A))>; def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)), (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; + def : Pat<(v1i128 (bswap v1i128 :$A)), + (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>; // Vector Permute def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc, diff --git 
a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -108,6 +108,11 @@ unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF); + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed = UINT_MAX); /// @} }; diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -943,6 +943,20 @@ return Cost; } +unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) { + return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF); +} + +unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, + ArrayRef<Type *> Tys, FastMathFlags FMF, + unsigned ScalarizationCostPassed) { + if (ID == Intrinsic::bswap && ST->hasP9Vector()) + return TLI->getTypeLegalizationCost(DL, RetTy).first; + return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF, + ScalarizationCostPassed); +} + bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) { diff --git a/llvm/test/CodeGen/PowerPC/vec-bswap.ll b/llvm/test/CodeGen/PowerPC/vec-bswap.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vec-bswap.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -mcpu=pwr9 \ +; RUN: -verify-machineinstrs -ppc-asm-full-reg-names | FileCheck %s +define dso_local void @test(i32* %Arr, i32 signext %Len) { +; CHECK-LABEL: test: +; CHECK: lxvx [[REG:vs[0-9]+]], 
r{{[0-9]+}}, r{{[0-9]+}} +; CHECK-NEXT: xxbrw vs{{[0-9]+}}, [[REG]] +entry: + %cmp1 = icmp slt i32 0, %Len + br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %min.iters.check = icmp ult i32 %Len, 4 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %for.body.lr.ph + %n.mod.vf = urem i32 %Len, 4 + %n.vec = sub i32 %Len, %n.mod.vf + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + %0 = add i32 %index, 0 + %1 = sext i32 %0 to i64 + %2 = getelementptr inbounds i32, i32* %Arr, i64 %1 + %3 = getelementptr inbounds i32, i32* %2, i32 0 + %4 = bitcast i32* %3 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %4, align 4 + %5 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %wide.load) + %6 = sext i32 %0 to i64 + %7 = getelementptr inbounds i32, i32* %Arr, i64 %6 + %8 = getelementptr inbounds i32, i32* %7, i32 0 + %9 = bitcast i32* %8 to <4 x i32>* + store <4 x i32> %5, <4 x i32>* %9, align 4 + %index.next = add i32 %index, 4 + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %Len, %n.vec + br i1 %cmp.n, label %for.cond.for.cond.cleanup_crit_edge, label %scalar.ph + +scalar.ph: ; preds = %middle.block, %for.body.lr.ph + %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %for.body.lr.ph ] + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %middle.block, %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + br label %for.end + +for.body: ; preds = %for.inc, %scalar.ph + %i.02 = phi 
i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.inc ] + %idxprom = sext i32 %i.02 to i64 + %arrayidx = getelementptr inbounds i32, i32* %Arr, i64 %idxprom + %11 = load i32, i32* %arrayidx, align 4 + %12 = call i32 @llvm.bswap.i32(i32 %11) + %idxprom1 = sext i32 %i.02 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %Arr, i64 %idxprom1 + store i32 %12, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.02, 1 + %cmp = icmp slt i32 %inc, %Len + br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge + +for.end: ; preds = %for.cond.cleanup + ret void +} + +define dso_local <8 x i16> @test_halfword(<8 x i16> %a) local_unnamed_addr { +; CHECK-LABEL: test_halfword: +; CHECK: xxbrh vs34, vs34 +; CHECK-NEXT: blr +entry: + %0 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a) + ret <8 x i16> %0 +} + +define dso_local <2 x i64> @test_doubleword(<2 x i64> %a) local_unnamed_addr { +; CHECK-LABEL: test_doubleword: +; CHECK: xxbrd vs34, vs34 +; CHECK-NEXT: blr +entry: + %0 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a) + ret <2 x i64> %0 +} + +define dso_local <1 x i128> @test_quadword(<1 x i128> %a) local_unnamed_addr { +; CHECK-LABEL: test_quadword: +; CHECK: xxbrq vs34, vs34 +; CHECK-NEXT: blr +entry: + %0 = call <1 x i128> @llvm.bswap.v1i128(<1 x i128> %a) + ret <1 x i128> %0 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare <1 x i128> @llvm.bswap.v1i128(<1 x i128>) + +; Function Attrs: nounwind readnone speculatable willreturn +declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) + +; Function Attrs: nounwind readnone speculatable willreturn +declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.bswap.i32(i32) + +; Function Attrs: nounwind readnone speculatable willreturn +declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll 
b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-bswap.ll @@ -0,0 +1,97 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-vectorize -mtriple=powerpc64le-unknown-unknown \ +; RUN: -force-target-max-vector-interleave=1 -mcpu=pwr9 < %s | FileCheck %s +define dso_local void @test(i32* %Arr, i32 signext %Len) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[LEN:%.*]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[LEN]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[LEN]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[LEN]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP6:%.*]] = sext i32 
[[TMP0]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[LEN]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.for.cond.cleanup_crit_edge: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[I_02]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]]) +; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[I_02]] to i64 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 [[IDXPROM1]] +; CHECK-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[LEN]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], !llvm.loop !2 +; CHECK: for.end: +; CHECK-NEXT: ret 
void +; +entry: + %cmp1 = icmp slt i32 0, %Len + br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.inc + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + br label %for.end + +for.body: ; preds = %for.body.lr.ph, %for.inc + %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %idxprom = sext i32 %i.02 to i64 + %arrayidx = getelementptr inbounds i32, i32* %Arr, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %1 = call i32 @llvm.bswap.i32(i32 %0) + %idxprom1 = sext i32 %i.02 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %Arr, i64 %idxprom1 + store i32 %1, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.02, 1 + %cmp = icmp slt i32 %inc, %Len + br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge + +for.end: ; preds = %for.cond.cleanup + ret void +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare i32 @llvm.bswap.i32(i32)