diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -680,6 +680,12 @@ // The third argument is set to true if the builtin accumulates its result into // its given accumulator. +// Builtins provided with the _mma_ prefix for compatibility. +CUSTOM_BUILTIN(mma_lxvp, "W256SLLiW256C*", false) +CUSTOM_BUILTIN(mma_stxvp, "vW256SLLiW256C*", false) +CUSTOM_BUILTIN(mma_assemble_pair, "vW256*VV", false) +CUSTOM_BUILTIN(mma_disassemble_pair, "vv*W256*", false) + CUSTOM_BUILTIN(vsx_lxvp, "W256SLLiW256C*", false) CUSTOM_BUILTIN(vsx_stxvp, "vW256SLLiW256C*", false) CUSTOM_BUILTIN(vsx_assemble_pair, "vW256*VV", false) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -15340,9 +15340,12 @@ // return values. So, here we emit code extracting these values from the // intrinsic results and storing them using that pointer. 
if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc || - BuiltinID == PPC::BI__builtin_vsx_disassemble_pair) { + BuiltinID == PPC::BI__builtin_vsx_disassemble_pair || + BuiltinID == PPC::BI__builtin_mma_disassemble_pair) { unsigned NumVecs = 2; auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair; + if (BuiltinID == PPC::BI__builtin_mma_disassemble_pair) + Intrinsic = Intrinsic::ppc_mma_disassemble_pair; if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) { NumVecs = 4; Intrinsic = Intrinsic::ppc_mma_disassemble_acc; @@ -15371,8 +15374,11 @@ #include "clang/Basic/BuiltinsPPC.def" } if (BuiltinID == PPC::BI__builtin_vsx_lxvp || - BuiltinID == PPC::BI__builtin_vsx_stxvp) { - if (BuiltinID == PPC::BI__builtin_vsx_lxvp) { + BuiltinID == PPC::BI__builtin_mma_lxvp || + BuiltinID == PPC::BI__builtin_vsx_stxvp || + BuiltinID == PPC::BI__builtin_mma_stxvp) { + if (BuiltinID == PPC::BI__builtin_vsx_lxvp || + BuiltinID == PPC::BI__builtin_mma_lxvp) { Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy); Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]); } else { diff --git a/clang/test/CodeGen/builtins-ppc-pair-mma.c b/clang/test/CodeGen/builtins-ppc-pair-mma.c --- a/clang/test/CodeGen/builtins-ppc-pair-mma.c +++ b/clang/test/CodeGen/builtins-ppc-pair-mma.c @@ -1195,3 +1195,196 @@ __builtin_mma_xvf64gernp(&vq, vp, vc); *((__vector_quad *)resp) = vq; } + +// CHECK-LABEL: @test76( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> [[VC:%.*]], <16 x i8> [[VC]]) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[RESP:%.*]] to <256 x i1>* +// CHECK-NEXT: store <256 x i1> [[TMP0]], <256 x i1>* [[TMP1]], align 32, !tbaa !6 +// CHECK-NEXT: ret void +// +void test76(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = *((__vector_pair *)vpp); + __vector_pair res; + __builtin_mma_assemble_pair(&res, vc, vc); + *((__vector_pair *)resp) 
= res; +} + +// CHECK-LABEL: @test77( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VPP:%.*]] to <256 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, <256 x i1>* [[TMP0]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[RESP:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP3]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[RESP]], i64 16 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP7]], align 16 +// CHECK-NEXT: ret void +// +void test77(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __builtin_mma_disassemble_pair(resp, (__vector_pair*)vpp); +} + +// CHECK-LABEL: @test78( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]]) +// CHECK-NEXT: ret void +// +void test78(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_stxvp(vp, 0LL, vp2); +} + +// CHECK-LABEL: @test79( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, 
i8* [[TMP3]], i64 [[OFFSET]] +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test79(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(offset, vpp); + __builtin_mma_stxvp(vp, offset, vp2); +} + +// CHECK-LABEL: @test80( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test80(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(18LL, vpp); + __builtin_mma_stxvp(vp, 18LL, vp2); +} + +// CHECK-LABEL: @test81( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test81(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(1LL, vpp); + __builtin_mma_stxvp(vp, 1LL, vp2); +} + +// CHECK-LABEL: @test82( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test82(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(42LL, vpp); + __builtin_mma_stxvp(vp, 42LL, vp2); +} + +// CHECK-LABEL: @test83( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8* +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8* +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test83(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp); + __builtin_mma_stxvp(vp, 32768LL, vp2); +} + +// CHECK-LABEL: @test84( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799 +// CHECK-NEXT: [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8* +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 32799 +// CHECK-NEXT: tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]]) +// CHECK-NEXT: ret void +// +void test84(const __vector_pair *vpp, const __vector_pair *vp2) { + __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp); + __builtin_mma_stxvp(vp, 32799LL, vp2); +} + +// CHECK-LABEL: @test85( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = 
load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test85(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(8LL, vpp); + __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test86( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test86(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(0LL, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + +// CHECK-LABEL: @test87( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* 
[[VQP:%.*]] to <512 x i1>* +// CHECK-NEXT: [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8* +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>* +// CHECK-NEXT: store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]] +// CHECK-NEXT: ret void +// +void test87(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) { + __vector_quad vq = *((__vector_quad *)vqp); + __vector_pair vp = __builtin_mma_lxvp(offs, vpp); + __builtin_mma_xvf64gernp(&vq, vp, vc); + *((__vector_quad *)resp) = vq; +} + diff --git a/clang/test/Sema/ppc-pair-mma-types.c b/clang/test/Sema/ppc-pair-mma-types.c --- a/clang/test/Sema/ppc-pair-mma-types.c +++ b/clang/test/Sema/ppc-pair-mma-types.c @@ -247,6 +247,7 @@ __vector_pair vp1 = *vpp; __vector_pair vp2; __builtin_vsx_assemble_pair(&vp2, vc, vc); + __builtin_mma_assemble_pair(&vp2, vc, vc); __vector_pair vp3; __vector_quad vq; __builtin_mma_xvf64ger(&vq, vp3, vc); @@ -321,12 +322,16 @@ void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) { __vector_pair vp = __builtin_vsx_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} + __vector_pair vp1 = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}} __builtin_vsx_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 'long long'}} + __builtin_mma_stxvp(vp, 32799, vp2); // expected-error {{passing 'int' to parameter of incompatible type 
'long long'}} } void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) { __vector_pair vp = __builtin_vsx_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} + __vector_pair vp1 = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}} __builtin_vsx_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} + __builtin_mma_stxvp(vp, c, vp2); // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}} } void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) { diff --git a/clang/test/SemaCXX/ppc-pair-mma-types.cpp b/clang/test/SemaCXX/ppc-pair-mma-types.cpp --- a/clang/test/SemaCXX/ppc-pair-mma-types.cpp +++ b/clang/test/SemaCXX/ppc-pair-mma-types.cpp @@ -368,6 +368,7 @@ return *vpp; // expected-error {{invalid use of PPC MMA type}} }; auto f3 = [](vector unsigned char vc) { __vector_pair vp; __builtin_vsx_assemble_pair(&vp, vc, vc); return vp; }; // expected-error {{invalid use of PPC MMA type}} + auto f4 = [](vector unsigned char vc) { __vector_pair vp; __builtin_mma_assemble_pair(&vp, vc, vc); return vp; }; // expected-error {{invalid use of PPC MMA type}} } // cast diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1136,6 +1136,8 @@ IntrArgMemOnly]>; def int_ppc_vsx_lxvp : Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; +def int_ppc_mma_lxvp : + Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>; // Vector store. 
def int_ppc_vsx_stxvw4x : Intrinsic<[], [llvm_v4i32_ty, llvm_ptr_ty], @@ -1155,6 +1157,9 @@ def int_ppc_vsx_stxvp : Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], [IntrWriteMem, IntrArgMemOnly]>; +def int_ppc_mma_stxvp : + Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty], [IntrWriteMem, + IntrArgMemOnly]>; // Vector and scalar maximum. def int_ppc_vsx_xvmaxdp : PowerPC_VSX_Vec_DDD_Intrinsic<"xvmaxdp">; def int_ppc_vsx_xvmaxsp : PowerPC_VSX_Vec_FFF_Intrinsic<"xvmaxsp">; @@ -1417,10 +1422,18 @@ Intrinsic<[llvm_v256i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + def int_ppc_mma_assemble_pair : + Intrinsic<[llvm_v256i1_ty], + [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + def int_ppc_vsx_disassemble_pair : Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty], [llvm_v256i1_ty], [IntrNoMem]>; + def int_ppc_mma_disassemble_pair : + Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty], + [llvm_v256i1_ty], [IntrNoMem]>; + def int_ppc_mma_assemble_acc : Intrinsic<[llvm_v512i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10154,7 +10154,8 @@ return DAG.getRegister(PPC::R2, MVT::i32); case Intrinsic::ppc_mma_disassemble_acc: - case Intrinsic::ppc_vsx_disassemble_pair: { + case Intrinsic::ppc_vsx_disassemble_pair: + case Intrinsic::ppc_mma_disassemble_pair: { int NumVecs = 2; SDValue WideVec = Op.getOperand(1); if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) { diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -1623,6 +1623,8 @@ Concats.VecsToVecPair0>; def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)), Concats.VecsToVecPair0>; + def : Pat<(v256i1 (int_ppc_mma_assemble_pair v16i8:$vs1, v16i8:$vs0)), + 
Concats.VecsToVecPair0>; def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 0)), (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>; def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 1)), @@ -1664,18 +1666,27 @@ let Predicates = [PairedVectorMemops] in { // Intrinsics for Paired Vector Loads. def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>; def : Pat<(v256i1 (int_ppc_vsx_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; + def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>; let Predicates = [PairedVectorMemops, PrefixInstrs] in { def : Pat<(v256i1 (int_ppc_vsx_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; + def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>; } // Intrinsics for Paired Vector Stores. def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX16:$dst), (STXVP $XSp, memrix16:$dst)>; + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst), + (STXVP $XSp, memrix16:$dst)>; def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, xaddrX16:$dst), (STXVPX $XSp, xaddrX16:$dst)>; + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst), + (STXVPX $XSp, xaddrX16:$dst)>; let Predicates = [PairedVectorMemops, PrefixInstrs] in { def : Pat<(int_ppc_vsx_stxvp v256i1:$XSp, iaddrX34:$dst), (PSTXVP $XSp, memri34:$dst)>; + def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst), + (PSTXVP $XSp, memri34:$dst)>; } } diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -276,9 +276,11 @@ return SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(MemI)) { if (IMemI->getIntrinsicID() == Intrinsic::prefetch || - IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) 
return IMemI->getArgOperand(0); - if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) + if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) return IMemI->getArgOperand(1); } @@ -347,10 +349,12 @@ PtrValue = SMemI->getPointerOperand(); } else if (IntrinsicInst *IMemI = dyn_cast<IntrinsicInst>(&J)) { if (IMemI->getIntrinsicID() == Intrinsic::prefetch || - IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) { + IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_lxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(0); - } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp) { + } else if (IMemI->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp || + IMemI->getIntrinsicID() == Intrinsic::ppc_mma_stxvp) { MemI = IMemI; PtrValue = IMemI->getArgOperand(1); } else continue; @@ -835,7 +839,9 @@ // There are no update forms for P10 lxvp/stxvp intrinsic. auto *II = dyn_cast<IntrinsicInst>(I); if (II && ((II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp) || - II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp)) + II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp)) return false; // See getPreIndexedAddressParts, the displacement for LDU/STDU has to // be 4's multiple (DS-form). For i64 loads/stores when the displacement @@ -878,7 +884,9 @@ auto *II = dyn_cast<IntrinsicInst>(I); if (II) return II->getIntrinsicID() == Intrinsic::ppc_vsx_lxvp || - II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp; + II->getIntrinsicID() == Intrinsic::ppc_mma_lxvp || + II->getIntrinsicID() == Intrinsic::ppc_vsx_stxvp || + II->getIntrinsicID() == Intrinsic::ppc_mma_stxvp; // Check if it is a P9 vector load/store. 
return ST && ST->hasP9Vector() && (PtrValue->getType()->getPointerElementType()->isVectorTy()); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1275,7 +1275,8 @@ case Intrinsic::ppc_vsx_lxvw4x_be: case Intrinsic::ppc_vsx_lxvl: case Intrinsic::ppc_vsx_lxvll: - case Intrinsic::ppc_vsx_lxvp: { + case Intrinsic::ppc_vsx_lxvp: + case Intrinsic::ppc_mma_lxvp: { Info.PtrVal = Inst->getArgOperand(0); Info.ReadMem = true; Info.WriteMem = false; @@ -1292,7 +1293,8 @@ case Intrinsic::ppc_vsx_stxvw4x_be: case Intrinsic::ppc_vsx_stxvl: case Intrinsic::ppc_vsx_stxvll: - case Intrinsic::ppc_vsx_stxvp: { + case Intrinsic::ppc_vsx_stxvp: + case Intrinsic::ppc_mma_stxvp: { Info.PtrVal = Inst->getArgOperand(1); Info.ReadMem = false; Info.WriteMem = true; diff --git a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll --- a/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll +++ b/llvm/test/CodeGen/PowerPC/dform-pair-load-store.ll @@ -101,3 +101,95 @@ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) +define void @foo1(i32 zeroext %n, <256 x i1>* %ptr, <256 x i1>* %ptr2) { +; CHECK-LABEL: foo1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmplwi r3, 0 +; CHECK-NEXT: beqlr cr0 +; CHECK-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-NEXT: clrldi r6, r3, 32 +; CHECK-NEXT: addi r3, r4, 64 +; CHECK-NEXT: addi r4, r5, 64 +; CHECK-NEXT: mtctr r6 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp0, -64(r3) +; CHECK-NEXT: lxvp vsp2, -32(r3) +; CHECK-NEXT: lxvp vsp4, 0(r3) +; CHECK-NEXT: lxvp vsp6, 32(r3) +; CHECK-NEXT: addi r3, r3, 1 +; CHECK-NEXT: stxvp vsp0, -64(r4) +; CHECK-NEXT: stxvp vsp2, -32(r4) +; 
CHECK-NEXT: stxvp vsp4, 0(r4) +; CHECK-NEXT: stxvp vsp6, 32(r4) +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: bdnz .LBB1_2 +; CHECK-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmplwi r3, 0 +; CHECK-BE-NEXT: beqlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %for.body.lr.ph +; CHECK-BE-NEXT: clrldi r6, r3, 32 +; CHECK-BE-NEXT: addi r3, r4, 64 +; CHECK-BE-NEXT: addi r4, r5, 64 +; CHECK-BE-NEXT: mtctr r6 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB1_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp0, -64(r3) +; CHECK-BE-NEXT: lxvp vsp2, -32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r3) +; CHECK-BE-NEXT: lxvp vsp6, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 1 +; CHECK-BE-NEXT: stxvp vsp0, -64(r4) +; CHECK-BE-NEXT: stxvp vsp2, -32(r4) +; CHECK-BE-NEXT: stxvp vsp4, 0(r4) +; CHECK-BE-NEXT: stxvp vsp6, 32(r4) +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: bdnz .LBB1_2 +; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-NEXT: blr +entry: + %cmp35.not = icmp eq i32 %n, 0 + br i1 %cmp35.not, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + %0 = bitcast <256 x i1>* %ptr to i8* + %1 = bitcast <256 x i1>* %ptr2 to i8* + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %2 = getelementptr i8, i8* %0, i64 %indvars.iv + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %add2 = add nuw nsw i64 %indvars.iv, 32 + %4 = getelementptr i8, i8* %0, i64 %add2 + %5 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %4) + %add4 = add nuw nsw i64 %indvars.iv, 64 + %6 = getelementptr i8, i8* %0, i64 %add4 + %7 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %6) + %add6 = add nuw nsw i64 %indvars.iv, 96 + %8 = getelementptr i8, i8* %0, i64 %add6 + %9 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %8) + %10 = getelementptr i8, i8* %1, i64 %indvars.iv + tail 
call void @llvm.ppc.mma.stxvp(<256 x i1> %3, i8* %10) + %11 = getelementptr i8, i8* %1, i64 %add2 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %5, i8* %11) + %12 = getelementptr i8, i8* %1, i64 %add4 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %7, i8* %12) + %13 = getelementptr i8, i8* %1, i64 %add6 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %9, i8* %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll --- a/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/loop-p10-pair-prepare.ll @@ -112,3 +112,105 @@ declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) + +define void @foo1(i64* %.n, [0 x %_elem_type_of_x]* %.x, [0 x %_elem_type_of_y]* %.y, <2 x double>* %.sum) { +; CHECK-LABEL: foo1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld r5, 0(r3) +; CHECK-NEXT: cmpdi r5, 1 +; CHECK-NEXT: bltlr cr0 +; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-NEXT: addi r3, r4, 1 +; CHECK-NEXT: addi r4, r5, -1 +; CHECK-NEXT: lxv vs0, 0(r6) +; CHECK-NEXT: rldicl r4, r4, 60, 4 +; CHECK-NEXT: addi r4, r4, 1 +; CHECK-NEXT: mtctr r4 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB1_2: # %_loop_1_do_ +; CHECK-NEXT: # +; CHECK-NEXT: lxvp vsp2, 0(r3) +; CHECK-NEXT: lxvp vsp4, 32(r3) +; CHECK-NEXT: addi r3, r3, 128 +; CHECK-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-NEXT: bdnz .LBB1_2 +; CHECK-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-NEXT: stxv vs0, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: foo1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: ld r5, 0(r3) +; CHECK-BE-NEXT: cmpdi r5, 1 +; 
CHECK-BE-NEXT: bltlr cr0 +; CHECK-BE-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-BE-NEXT: addi r3, r4, 1 +; CHECK-BE-NEXT: addi r4, r5, -1 +; CHECK-BE-NEXT: lxv vs0, 0(r6) +; CHECK-BE-NEXT: rldicl r4, r4, 60, 4 +; CHECK-BE-NEXT: addi r4, r4, 1 +; CHECK-BE-NEXT: mtctr r4 +; CHECK-BE-NEXT: .p2align 5 +; CHECK-BE-NEXT: .LBB1_2: # %_loop_1_do_ +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxvp vsp2, 0(r3) +; CHECK-BE-NEXT: lxvp vsp4, 32(r3) +; CHECK-BE-NEXT: addi r3, r3, 128 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs2 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs3 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs4 +; CHECK-BE-NEXT: xvadddp vs0, vs0, vs5 +; CHECK-BE-NEXT: bdnz .LBB1_2 +; CHECK-BE-NEXT: # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge +; CHECK-BE-NEXT: stxv vs0, 0(r6) +; CHECK-BE-NEXT: blr +entry: + %_val_n_2 = load i64, i64* %.n, align 8 + %_grt_tmp7 = icmp slt i64 %_val_n_2, 1 + br i1 %_grt_tmp7, label %_return_bb, label %_loop_1_do_.lr.ph + +_loop_1_do_.lr.ph: ; preds = %entry + %x_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %.sum.promoted = load <2 x double>, <2 x double>* %.sum, align 16 + br label %_loop_1_do_ + +_loop_1_do_: ; preds = %_loop_1_do_.lr.ph, %_loop_1_do_ + %_val_sum_9 = phi <2 x double> [ %.sum.promoted, %_loop_1_do_.lr.ph ], [ %_add_tmp49, %_loop_1_do_ ] + %i.08 = phi i64 [ 1, %_loop_1_do_.lr.ph ], [ %_loop_1_update_loop_ix, %_loop_1_do_ ] + %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08 + %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8* + %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1 + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %1) + %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0 + %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1 + %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33 + %4 = tail call <256 x i1> 
@llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %4) + %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0 + %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1 + %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> + %_add_tmp23 = fadd contract <2 x double> %_val_sum_9, %6 + %7 = bitcast <16 x i8> %.fca.1.extract2 to <2 x double> + %_add_tmp32 = fadd contract <2 x double> %_add_tmp23, %7 + %8 = bitcast <16 x i8> %.fca.0.extract to <2 x double> + %_add_tmp40 = fadd contract <2 x double> %_add_tmp32, %8 + %9 = bitcast <16 x i8> %.fca.1.extract to <2 x double> + %_add_tmp49 = fadd contract <2 x double> %_add_tmp40, %9 + %_loop_1_update_loop_ix = add nuw nsw i64 %i.08, 16 + %_grt_tmp = icmp sgt i64 %_loop_1_update_loop_ix, %_val_n_2 + br i1 %_grt_tmp, label %_loop_1_loopHeader_._return_bb_crit_edge, label %_loop_1_do_ + +_loop_1_loopHeader_._return_bb_crit_edge: ; preds = %_loop_1_do_ + store <2 x double> %_add_tmp49, <2 x double>* %.sum, align 16 + br label %_return_bb + +_return_bb: ; preds = %_loop_1_loopHeader_._return_bb_crit_edge, %entry + ret void +} + +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -761,5 +761,143 @@ ret void } +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) + +; Function Attrs: nofree nounwind +define void @test_ldst_4(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: li r3, 8 +; CHECK-NEXT: lxvpx vsp4, r4, r3 +; 
CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: li r3, 8 +; CHECK-BE-NEXT: lxvpx vsp4, r4, r3 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp4, v2, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = getelementptr i8, i8* %2, i64 8 + %4 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %3) + %5 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %1, <256 x i1> %4, <16 x i8> %vc, i32 0, i32 0) + %6 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %5, <512 x i1>* %6, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_5(i8* nocapture readonly %vqp, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r4) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r7) +; CHECK-NEXT: stxv vs1, 32(r7) +; CHECK-NEXT: stxv vs2, 16(r7) +; CHECK-NEXT: stxv vs3, 0(r7) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 
48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r4) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r7) +; CHECK-BE-NEXT: stxv vs0, 0(r7) +; CHECK-BE-NEXT: stxv vs3, 48(r7) +; CHECK-BE-NEXT: stxv vs2, 32(r7) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +} + +; Function Attrs: nofree nounwind +define void @test_ldst_6(i8* nocapture readonly %vqp, i64 %offs, <256 x i1>* %vpp, <16 x i8> %vc, i8* nocapture %resp) { +; CHECK-LABEL: test_ldst_6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 32(r3) +; CHECK-NEXT: lxv vs0, 48(r3) +; CHECK-NEXT: lxv vs3, 0(r3) +; CHECK-NEXT: lxv vs2, 16(r3) +; CHECK-NEXT: lxvp vsp4, 0(r5) +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r9) +; CHECK-NEXT: stxv vs1, 32(r9) +; CHECK-NEXT: stxv vs2, 16(r9) +; CHECK-NEXT: stxv vs3, 0(r9) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: lxv vs3, 48(r3) +; CHECK-BE-NEXT: lxv vs2, 32(r3) +; CHECK-BE-NEXT: lxvp vsp4, 0(r5) +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvf64gernp acc0, vsp4, v2 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r9) +; CHECK-BE-NEXT: stxv vs0, 0(r9) +; CHECK-BE-NEXT: stxv vs3, 48(r9) +; CHECK-BE-NEXT: stxv vs2, 32(r9) +; CHECK-BE-NEXT: blr +entry: + %0 = bitcast i8* %vqp to <512 x i1>* + %1 = load <512 x i1>, <512 x i1>* %0, align 64 + %2 = bitcast <256 x i1>* %vpp to i8* + %3 = tail 
call <256 x i1> @llvm.ppc.mma.lxvp(i8* %2) + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> %1, <256 x i1> %3, <16 x i8> %vc) + %5 = bitcast i8* %resp to <512 x i1>* + store <512 x i1> %4, <512 x i1>* %5, align 64 + ret void +} + declare <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1>, <256 x i1>, <16 x i8>, i32, i32) declare <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1>, <256 x i1>, <16 x i8>) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -8,6 +8,7 @@ declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) +declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { ; CHECK-LABEL: intrinsics1: ; CHECK: # %bb.0: @@ -69,6 +70,67 @@ ret void } +define void @intrinsics1_1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4, i8* %ptr) { +; CHECK-LABEL: intrinsics1_1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 +; CHECK-NEXT: vmr v1, v4 +; CHECK-NEXT: vmr v4, v3 +; CHECK-NEXT: ld r3, 96(r1) +; CHECK-NEXT: vmr v0, v2 +; CHECK-NEXT: xxlor vs5, v2, v2 +; CHECK-NEXT: xxlor vs4, v5, v5 +; CHECK-NEXT: xxlor vs0, v0, v0 +; CHECK-NEXT: xxlor vs1, v1, v1 +; CHECK-NEXT: xxlor vs2, v4, v4 +; CHECK-NEXT: xxlor vs3, v5, v5 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvi4ger8pp acc0, v2, v3 +; CHECK-NEXT: xvf16ger2pp acc0, v2, v1 +; CHECK-NEXT: pmxvf32gerpn acc0, v3, v5, 0, 0 +; CHECK-NEXT: pmxvf64gernp acc0, vsp4, v0, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxvx vs3, 0, r3 +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: intrinsics1_1: +; 
CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: # kill: def $v5 killed $v5 killed $vsrp18 def $vsrp18 +; CHECK-BE-NEXT: vmr v1, v4 +; CHECK-BE-NEXT: vmr v4, v3 +; CHECK-BE-NEXT: ld r3, 112(r1) +; CHECK-BE-NEXT: vmr v0, v2 +; CHECK-BE-NEXT: xxlor vs5, v2, v2 +; CHECK-BE-NEXT: xxlor vs4, v5, v5 +; CHECK-BE-NEXT: xxlor vs0, v0, v0 +; CHECK-BE-NEXT: xxlor vs1, v1, v1 +; CHECK-BE-NEXT: xxlor vs2, v4, v4 +; CHECK-BE-NEXT: xxlor vs3, v5, v5 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvi4ger8pp acc0, v2, v3 +; CHECK-BE-NEXT: xvf16ger2pp acc0, v2, v1 +; CHECK-BE-NEXT: pmxvf32gerpn acc0, v3, v5, 0, 0 +; CHECK-BE-NEXT: pmxvf64gernp acc0, vsp4, v0, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxvx vs0, 0, r3 +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: blr + %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4) + %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) + %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) + %4 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %3, <16 x i8> %vc2, <16 x i8> %vc4, i32 0, i32 0) + %5 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) + %6 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernp(<512 x i1> %4, <256 x i1> %5, <16 x i8> %vc1, i32 0, i32 0) + %7 = bitcast i8* %ptr to <512 x i1>* + store <512 x i1> %6, <512 x i1>* %7, align 64 + ret void +} + declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) define void @intrinsics2(<16 x i8>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3, <16 x i8>* %ptr4, i8* %ptr) { ; CHECK-LABEL: intrinsics2: @@ -141,6 +203,77 @@ ret void } +define void @intrinsics2_2(<16 x i8>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3, <16 x i8>* %ptr4, i8* %ptr) { +; CHECK-LABEL: intrinsics2_2: +; CHECK: 
# %bb.0: +; CHECK-NEXT: lxv vs4, 0(r3) +; CHECK-NEXT: lxv vs5, 0(r4) +; CHECK-NEXT: lxv vs6, 0(r5) +; CHECK-NEXT: lxv vs7, 0(r6) +; CHECK-NEXT: xxlor vs0, vs4, vs4 +; CHECK-NEXT: xxlor vs9, vs4, vs4 +; CHECK-NEXT: xxlor vs1, vs5, vs5 +; CHECK-NEXT: xxlor vs2, vs6, vs6 +; CHECK-NEXT: xxlor vs3, vs7, vs7 +; CHECK-NEXT: xxlor vs8, vs7, vs7 +; CHECK-NEXT: xxmtacc acc0 +; CHECK-NEXT: xvi8ger4pp acc0, vs4, vs5 +; CHECK-NEXT: xvf16ger2pn acc0, vs4, vs6 +; CHECK-NEXT: pmxvf32gernn acc0, vs5, vs7, 0, 0 +; CHECK-NEXT: pmxvf64gernn acc0, vsp8, vs4, 0, 0 +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs2, 0(r4) +; CHECK-NEXT: stxv vs1, 0(r5) +; CHECK-NEXT: stxv vs0, 0(r6) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: intrinsics2_2: +; CHECK-BE: # %bb.0: +; CHECK-BE-NEXT: lxv vs4, 0(r3) +; CHECK-BE-NEXT: lxv vs5, 0(r4) +; CHECK-BE-NEXT: lxv vs6, 0(r5) +; CHECK-BE-NEXT: lxv vs7, 0(r6) +; CHECK-BE-NEXT: xxlor vs0, vs4, vs4 +; CHECK-BE-NEXT: xxlor vs9, vs4, vs4 +; CHECK-BE-NEXT: xxlor vs1, vs5, vs5 +; CHECK-BE-NEXT: xxlor vs2, vs6, vs6 +; CHECK-BE-NEXT: xxlor vs3, vs7, vs7 +; CHECK-BE-NEXT: xxlor vs8, vs7, vs7 +; CHECK-BE-NEXT: xxmtacc acc0 +; CHECK-BE-NEXT: xvi8ger4pp acc0, vs4, vs5 +; CHECK-BE-NEXT: xvf16ger2pn acc0, vs4, vs6 +; CHECK-BE-NEXT: pmxvf32gernn acc0, vs5, vs7, 0, 0 +; CHECK-BE-NEXT: pmxvf64gernn acc0, vsp8, vs4, 0, 0 +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs1, 0(r4) +; CHECK-BE-NEXT: stxv vs2, 0(r5) +; CHECK-BE-NEXT: stxv vs3, 0(r6) +; CHECK-BE-NEXT: blr + %vc1 = load <16 x i8>, <16 x i8>* %ptr1, align 16 + %vc2 = load <16 x i8>, <16 x i8>* %ptr2, align 16 + %vc3 = load <16 x i8>, <16 x i8>* %ptr3, align 16 + %vc4 = load <16 x i8>, <16 x i8>* %ptr4, align 16 + %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) + %2 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) + %3 = tail call 
<512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) + %4 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %3, <16 x i8> %vc2, <16 x i8> %vc4, i32 0, i32 0) + %5 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc4, <16 x i8> %vc1) + %6 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> %4, <256 x i1> %5, <16 x i8> %vc1, i32 0, i32 0) + %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %6) + %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %7, 0 + %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %7, 1 + %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %7, 2 + %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %7, 3 + %12 = bitcast i8* %ptr to <512 x i1>* + store <16 x i8> %8, <16 x i8>* %ptr1, align 16 + store <16 x i8> %9, <16 x i8>* %ptr2, align 16 + store <16 x i8> %10, <16 x i8>* %ptr3, align 16 + store <16 x i8> %11, <16 x i8>* %ptr4, align 16 + ret void +} + define void @test1(i8* %vqp, i8* %vpp, <16 x i8> %vc, i8* %resp) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -7,6 +7,7 @@ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) +declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() declare <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1>, <256 x i1>, <16 x i8>) declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1>) @@ -104,6 +105,100 @@ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +define void @testPHI1_1(<16 x i8>* %Dst, <16 x i8>* %Src, i32 signext %Len) { +; CHECK-LABEL: 
testPHI1_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: cmpwi r5, 3 +; CHECK-NEXT: xxsetaccz acc0 +; CHECK-NEXT: blt cr0, .LBB1_3 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: clrldi r5, r5, 32 +; CHECK-NEXT: lxv vs4, 0(r4) +; CHECK-NEXT: lxv vs5, 16(r4) +; CHECK-NEXT: addi r4, r4, 32 +; CHECK-NEXT: addi r5, r5, -2 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxv vs6, 0(r4) +; CHECK-NEXT: addi r4, r4, 16 +; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-NEXT: bdnz .LBB1_2 +; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testPHI1_1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: cmpwi r5, 3 +; CHECK-BE-NEXT: xxsetaccz acc0 +; CHECK-BE-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-NEXT: clrldi r5, r5, 32 +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: addi r4, r4, 32 +; CHECK-BE-NEXT: addi r5, r5, -2 +; CHECK-BE-NEXT: mtctr r5 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB1_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxv vs6, 0(r4) +; CHECK-BE-NEXT: addi r4, r4, 16 +; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-BE-NEXT: bdnz .LBB1_2 +; CHECK-BE-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = load <16 x i8>, <16 x i8>* %Src, align 16 + %arrayidx1 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 1 + %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 + %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %3 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() + %cmp11 = 
icmp sgt i32 %Len, 2 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %Len to i64 + br label %for.body + +for.cond.cleanup: + %Acc.0.lcssa = phi <512 x i1> [ %3, %entry ], [ %13, %for.body ] + %4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa) + %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 0 + store <16 x i8> %5, <16 x i8>* %Dst, align 16 + %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 1 + %7 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 1 + store <16 x i8> %6, <16 x i8>* %7, align 16 + %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 2 + %9 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 2 + store <16 x i8> %8, <16 x i8>* %9, align 16 + %10 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, 3 + %11 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 3 + store <16 x i8> %10, <16 x i8>* %11, align 16 + ret void + +for.body: + %indvars.iv = phi i64 [ 2, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %Acc.012 = phi <512 x i1> [ %3, %for.body.preheader ], [ %13, %for.body ] + %arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 %indvars.iv + %12 = load <16 x i8>, <16 x i8>* %arrayidx2, align 16 + %13 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.012, <256 x i1> %2, <16 x i8> %12) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + declare <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1>, <16 x i8>) define dso_local void @testPHI2(<16 x i8>* %Dst, <16 x i8>* %Src, i32 signext %Len) { ; CHECK-LABEL: testPHI2: @@ -113,20 +208,20 @@ ; CHECK-NEXT: lxv vs6, 32(r4) ; CHECK-NEXT: cmpwi r5, 4 ; CHECK-NEXT: xvf64ger acc0, vsp4, vs6 -; CHECK-NEXT: blt cr0, .LBB1_3 +; 
CHECK-NEXT: blt cr0, .LBB2_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: clrldi r5, r5, 32 ; CHECK-NEXT: addi r4, r4, 48 ; CHECK-NEXT: addi r5, r5, -3 ; CHECK-NEXT: mtctr r5 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB1_2: # %for.body +; CHECK-NEXT: .LBB2_2: # %for.body ; CHECK-NEXT: # ; CHECK-NEXT: lxv vs6, 0(r4) ; CHECK-NEXT: addi r4, r4, 16 ; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 -; CHECK-NEXT: bdnz .LBB1_2 -; CHECK-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-NEXT: bdnz .LBB2_2 +; CHECK-NEXT: .LBB2_3: # %for.cond.cleanup ; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: stxv vs3, 0(r3) ; CHECK-NEXT: stxv vs2, 16(r3) @@ -141,20 +236,20 @@ ; CHECK-BE-NEXT: lxv vs6, 32(r4) ; CHECK-BE-NEXT: cmpwi r5, 4 ; CHECK-BE-NEXT: xvf64ger acc0, vsp4, vs6 -; CHECK-BE-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-NEXT: blt cr0, .LBB2_3 ; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader ; CHECK-BE-NEXT: clrldi r5, r5, 32 ; CHECK-BE-NEXT: addi r4, r4, 48 ; CHECK-BE-NEXT: addi r5, r5, -3 ; CHECK-BE-NEXT: mtctr r5 ; CHECK-BE-NEXT: .p2align 4 -; CHECK-BE-NEXT: .LBB1_2: # %for.body +; CHECK-BE-NEXT: .LBB2_2: # %for.body ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: lxv vs6, 0(r4) ; CHECK-BE-NEXT: addi r4, r4, 16 ; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 -; CHECK-BE-NEXT: bdnz .LBB1_2 -; CHECK-BE-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-NEXT: bdnz .LBB2_2 +; CHECK-BE-NEXT: .LBB2_3: # %for.cond.cleanup ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs1, 16(r3) @@ -203,6 +298,104 @@ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +define dso_local void @testPHI2_2(<16 x i8>* %Dst, <16 x i8>* %Src, i32 signext %Len) { +; CHECK-LABEL: testPHI2_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs4, 0(r4) +; CHECK-NEXT: lxv vs5, 16(r4) +; CHECK-NEXT: lxv vs6, 32(r4) +; CHECK-NEXT: cmpwi r5, 4 +; CHECK-NEXT: xvf64ger acc0, vsp4, vs6 +; CHECK-NEXT: blt cr0, .LBB3_3 +; CHECK-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NEXT: clrldi r5, r5, 32 +; 
CHECK-NEXT: addi r4, r4, 48 +; CHECK-NEXT: addi r5, r5, -3 +; CHECK-NEXT: mtctr r5 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB3_2: # %for.body +; CHECK-NEXT: # +; CHECK-NEXT: lxv vs6, 0(r4) +; CHECK-NEXT: addi r4, r4, 16 +; CHECK-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-NEXT: bdnz .LBB3_2 +; CHECK-NEXT: .LBB3_3: # %for.cond.cleanup +; CHECK-NEXT: xxmfacc acc0 +; CHECK-NEXT: stxv vs3, 0(r3) +; CHECK-NEXT: stxv vs2, 16(r3) +; CHECK-NEXT: stxv vs1, 32(r3) +; CHECK-NEXT: stxv vs0, 48(r3) +; CHECK-NEXT: blr +; +; CHECK-BE-LABEL: testPHI2_2: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs4, 0(r4) +; CHECK-BE-NEXT: lxv vs5, 16(r4) +; CHECK-BE-NEXT: lxv vs6, 32(r4) +; CHECK-BE-NEXT: cmpwi r5, 4 +; CHECK-BE-NEXT: xvf64ger acc0, vsp4, vs6 +; CHECK-BE-NEXT: blt cr0, .LBB3_3 +; CHECK-BE-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-NEXT: clrldi r5, r5, 32 +; CHECK-BE-NEXT: addi r4, r4, 48 +; CHECK-BE-NEXT: addi r5, r5, -3 +; CHECK-BE-NEXT: mtctr r5 +; CHECK-BE-NEXT: .p2align 4 +; CHECK-BE-NEXT: .LBB3_2: # %for.body +; CHECK-BE-NEXT: # +; CHECK-BE-NEXT: lxv vs6, 0(r4) +; CHECK-BE-NEXT: addi r4, r4, 16 +; CHECK-BE-NEXT: xvf64gerpp acc0, vsp4, vs6 +; CHECK-BE-NEXT: bdnz .LBB3_2 +; CHECK-BE-NEXT: .LBB3_3: # %for.cond.cleanup +; CHECK-BE-NEXT: xxmfacc acc0 +; CHECK-BE-NEXT: stxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 32(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: blr +entry: + %0 = load <16 x i8>, <16 x i8>* %Src, align 16 + %arrayidx1 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 1 + %1 = load <16 x i8>, <16 x i8>* %arrayidx1, align 16 + %2 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %0, <16 x i8> %1) + %arrayidx2 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 2 + %3 = load <16 x i8>, <16 x i8>* %arrayidx2, align 16 + %4 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %2, <16 x i8> %3) + %cmp14 = icmp sgt i32 %Len, 3 + br i1 %cmp14, label %for.body.preheader, label 
%for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %Len to i64 + br label %for.body + +for.cond.cleanup: + %Acc.0.lcssa = phi <512 x i1> [ %4, %entry ], [ %14, %for.body ] + %5 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %Acc.0.lcssa) + %6 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 0 + store <16 x i8> %6, <16 x i8>* %Dst, align 16 + %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 1 + %8 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 1 + store <16 x i8> %7, <16 x i8>* %8, align 16 + %9 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 2 + %10 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 2 + store <16 x i8> %9, <16 x i8>* %10, align 16 + %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %5, 3 + %12 = getelementptr inbounds <16 x i8>, <16 x i8>* %Dst, i64 3 + store <16 x i8> %11, <16 x i8>* %12, align 16 + ret void + +for.body: + %indvars.iv = phi i64 [ 3, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %Acc.015 = phi <512 x i1> [ %4, %for.body.preheader ], [ %14, %for.body ] + %arrayidx3 = getelementptr inbounds <16 x i8>, <16 x i8>* %Src, i64 %indvars.iv + %13 = load <16 x i8>, <16 x i8>* %arrayidx3, align 16 + %14 = tail call <512 x i1> @llvm.ppc.mma.xvf64gerpp(<512 x i1> %Acc.015, <256 x i1> %2, <16 x i8> %13) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + ; This test uses an unprimed accumulator PHI node with two operands: an ; implicitely defined unprimed accumulator and the unprimed result of the call ; to xvf64gerpp. 
The compiler should replace this PHI node by a primed @@ -211,10 +404,10 @@ ; CHECK-LABEL: testImplicitDef: ; CHECK: # %bb.0: # %label1 ; CHECK-NEXT: # implicit-def: $acc0 -; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-NEXT: bc 12, 4*cr5+lt, .LBB4_2 ; CHECK-NEXT: # %bb.1: # %label2 ; CHECK-NEXT: xvf64gerpp acc0, vsp0, vs0 -; CHECK-NEXT: .LBB2_2: # %label3 +; CHECK-NEXT: .LBB4_2: # %label3 ; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: stxv vs0, 0(r3) ; CHECK-NEXT: blr @@ -222,10 +415,10 @@ ; CHECK-BE-LABEL: testImplicitDef: ; CHECK-BE: # %bb.0: # %label1 ; CHECK-BE-NEXT: # implicit-def: $acc0 -; CHECK-BE-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-NEXT: bc 12, 4*cr5+lt, .LBB4_2 ; CHECK-BE-NEXT: # %bb.1: # %label2 ; CHECK-BE-NEXT: xvf64gerpp acc0, vsp0, vs0 -; CHECK-BE-NEXT: .LBB2_2: # %label3 +; CHECK-BE-NEXT: .LBB4_2: # %label3 ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr @@ -252,25 +445,25 @@ ; CHECK-LABEL: testNestedPHI: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmplwi r3, 0 -; CHECK-NEXT: beq cr0, .LBB3_2 +; CHECK-NEXT: beq cr0, .LBB5_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: xvf32gernp acc0, v2, v2 ; CHECK-NEXT: cmpwi r4, 1 -; CHECK-NEXT: bge cr0, .LBB3_3 -; CHECK-NEXT: b .LBB3_5 -; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: bge cr0, .LBB5_3 +; CHECK-NEXT: b .LBB5_5 +; CHECK-NEXT: .LBB5_2: ; CHECK-NEXT: # implicit-def: $acc0 ; CHECK-NEXT: cmpwi r4, 1 -; CHECK-NEXT: blt cr0, .LBB3_5 -; CHECK-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-NEXT: blt cr0, .LBB5_5 +; CHECK-NEXT: .LBB5_3: # %for.body.preheader ; CHECK-NEXT: clrldi r3, r4, 32 ; CHECK-NEXT: mtctr r3 ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB3_4: # %for.body +; CHECK-NEXT: .LBB5_4: # %for.body ; CHECK-NEXT: # ; CHECK-NEXT: xvf32gernp acc0, v2, v2 -; CHECK-NEXT: bdnz .LBB3_4 -; CHECK-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-NEXT: bdnz .LBB5_4 +; CHECK-NEXT: .LBB5_5: # %for.cond.cleanup ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: xxmfacc acc0 ; CHECK-NEXT: stxv vs0, 48(r5) 
@@ -282,25 +475,25 @@ ; CHECK-BE-LABEL: testNestedPHI: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: cmplwi r3, 0 -; CHECK-BE-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-NEXT: beq cr0, .LBB5_2 ; CHECK-BE-NEXT: # %bb.1: # %if.then ; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2 ; CHECK-BE-NEXT: cmpwi r4, 1 -; CHECK-BE-NEXT: bge cr0, .LBB3_3 -; CHECK-BE-NEXT: b .LBB3_5 -; CHECK-BE-NEXT: .LBB3_2: +; CHECK-BE-NEXT: bge cr0, .LBB5_3 +; CHECK-BE-NEXT: b .LBB5_5 +; CHECK-BE-NEXT: .LBB5_2: ; CHECK-BE-NEXT: # implicit-def: $acc0 ; CHECK-BE-NEXT: cmpwi r4, 1 -; CHECK-BE-NEXT: blt cr0, .LBB3_5 -; CHECK-BE-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-NEXT: blt cr0, .LBB5_5 +; CHECK-BE-NEXT: .LBB5_3: # %for.body.preheader ; CHECK-BE-NEXT: clrldi r3, r4, 32 ; CHECK-BE-NEXT: mtctr r3 ; CHECK-BE-NEXT: .p2align 4 -; CHECK-BE-NEXT: .LBB3_4: # %for.body +; CHECK-BE-NEXT: .LBB5_4: # %for.body ; CHECK-BE-NEXT: # ; CHECK-BE-NEXT: xvf32gernp acc0, v2, v2 -; CHECK-BE-NEXT: bdnz .LBB3_4 -; CHECK-BE-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-NEXT: bdnz .LBB5_4 +; CHECK-BE-NEXT: .LBB5_5: # %for.cond.cleanup ; CHECK-BE-NEXT: li r3, 0 ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs1, 16(r5) diff --git a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll --- a/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll +++ b/llvm/test/CodeGen/PowerPC/more-dq-form-prepare.ll @@ -640,7 +640,642 @@ ret void } +define void @foo1(i32* %.m, i32* %.n, [0 x %_elem_type_of_a]* %.a, [0 x %_elem_type_of_x]* %.x, i32* %.l, <2 x double>* %.vy01, <2 x double>* %.vy02, <2 x double>* %.vy03, <2 x double>* %.vy04, <2 x double>* %.vy05, <2 x double>* %.vy06, <2 x double>* %.vy07, <2 x double>* %.vy08, <2 x double>* %.vy09, <2 x double>* %.vy0a, <2 x double>* %.vy0b, <2 x double>* %.vy0c, <2 x double>* %.vy21, <2 x double>* %.vy22, <2 x double>* %.vy23, <2 x double>* %.vy24, <2 x double>* %.vy25, <2 x double>* %.vy26, <2 x double>* %.vy27, <2 x double>* %.vy28, <2 x 
double>* %.vy29, <2 x double>* %.vy2a, <2 x double>* %.vy2b, <2 x double>* %.vy2c) { +; CHECK-LABEL: foo1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stdu 1, -480(1) +; CHECK-NEXT: .cfi_def_cfa_offset 480 +; CHECK-NEXT: .cfi_offset r14, -256 +; CHECK-NEXT: .cfi_offset r15, -248 +; CHECK-NEXT: .cfi_offset r16, -240 +; CHECK-NEXT: .cfi_offset r17, -232 +; CHECK-NEXT: .cfi_offset r18, -224 +; CHECK-NEXT: .cfi_offset r19, -216 +; CHECK-NEXT: .cfi_offset r20, -208 +; CHECK-NEXT: .cfi_offset r21, -200 +; CHECK-NEXT: .cfi_offset r22, -192 +; CHECK-NEXT: .cfi_offset r23, -184 +; CHECK-NEXT: .cfi_offset r24, -176 +; CHECK-NEXT: .cfi_offset r25, -168 +; CHECK-NEXT: .cfi_offset r26, -160 +; CHECK-NEXT: .cfi_offset r27, -152 +; CHECK-NEXT: .cfi_offset r28, -144 +; CHECK-NEXT: .cfi_offset r29, -136 +; CHECK-NEXT: .cfi_offset r30, -128 +; CHECK-NEXT: .cfi_offset r31, -120 +; CHECK-NEXT: .cfi_offset f18, -112 +; CHECK-NEXT: .cfi_offset f19, -104 +; CHECK-NEXT: .cfi_offset f20, -96 +; CHECK-NEXT: .cfi_offset f21, -88 +; CHECK-NEXT: .cfi_offset f22, -80 +; CHECK-NEXT: .cfi_offset f23, -72 +; CHECK-NEXT: .cfi_offset f24, -64 +; CHECK-NEXT: .cfi_offset f25, -56 +; CHECK-NEXT: .cfi_offset f26, -48 +; CHECK-NEXT: .cfi_offset f27, -40 +; CHECK-NEXT: .cfi_offset f28, -32 +; CHECK-NEXT: .cfi_offset f29, -24 +; CHECK-NEXT: .cfi_offset f30, -16 +; CHECK-NEXT: .cfi_offset f31, -8 +; CHECK-NEXT: lwz 4, 0(4) +; CHECK-NEXT: std 14, 224(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 232(1) # 8-byte Folded Spill +; CHECK-NEXT: cmpwi 4, 1 +; CHECK-NEXT: std 16, 240(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 248(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 256(1) # 8-byte Folded Spill +; CHECK-NEXT: std 19, 264(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 272(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 280(1) # 8-byte Folded Spill +; CHECK-NEXT: std 22, 288(1) # 8-byte Folded Spill +; CHECK-NEXT: std 23, 296(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 304(1) # 8-byte Folded 
Spill +; CHECK-NEXT: std 25, 312(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 320(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 328(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 336(1) # 8-byte Folded Spill +; CHECK-NEXT: std 29, 344(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 352(1) # 8-byte Folded Spill +; CHECK-NEXT: std 31, 360(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 18, 368(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 19, 376(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 20, 384(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 21, 392(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 22, 400(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 23, 408(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 24, 416(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 25, 424(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 26, 432(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 27, 440(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 28, 448(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 29, 456(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 30, 464(1) # 8-byte Folded Spill +; CHECK-NEXT: stfd 31, 472(1) # 8-byte Folded Spill +; CHECK-NEXT: blt 0, .LBB1_7 +; CHECK-NEXT: # %bb.1: # %_loop_1_do_.lr.ph +; CHECK-NEXT: mr 23, 5 +; CHECK-NEXT: lwz 5, 0(3) +; CHECK-NEXT: cmpwi 5, 1 +; CHECK-NEXT: blt 0, .LBB1_7 +; CHECK-NEXT: # %bb.2: # %_loop_1_do_.preheader +; CHECK-NEXT: addi 5, 5, 1 +; CHECK-NEXT: li 20, 9 +; CHECK-NEXT: ld 28, 728(1) +; CHECK-NEXT: ld 19, 616(1) +; CHECK-NEXT: lwa 3, 0(7) +; CHECK-NEXT: ld 7, 688(1) +; CHECK-NEXT: ld 12, 680(1) +; CHECK-NEXT: ld 11, 672(1) +; CHECK-NEXT: ld 2, 664(1) +; CHECK-NEXT: ld 29, 736(1) +; CHECK-NEXT: cmpldi 5, 9 +; CHECK-NEXT: ld 27, 720(1) +; CHECK-NEXT: ld 26, 712(1) +; CHECK-NEXT: ld 25, 704(1) +; CHECK-NEXT: ld 24, 696(1) +; CHECK-NEXT: iselgt 5, 5, 20 +; CHECK-NEXT: ld 30, 656(1) +; CHECK-NEXT: ld 22, 648(1) +; CHECK-NEXT: ld 21, 640(1) +; CHECK-NEXT: ld 20, 632(1) +; CHECK-NEXT: ld 18, 608(1) +; CHECK-NEXT: ld 17, 600(1) +; CHECK-NEXT: ld 16, 592(1) 
+; CHECK-NEXT: ld 14, 584(1) +; CHECK-NEXT: sldi 0, 3, 2 +; CHECK-NEXT: std 5, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 28, 208(1) # 8-byte Folded Spill +; CHECK-NEXT: mr 5, 4 +; CHECK-NEXT: ld 4, 624(1) +; CHECK-NEXT: std 19, 96(1) # 8-byte Folded Spill +; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 11, 0(4) +; CHECK-NEXT: mr 4, 5 +; CHECK-NEXT: ld 5, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 576(1) +; CHECK-NEXT: sldi 31, 3, 1 +; CHECK-NEXT: std 8, 32(1) # 8-byte Folded Spill +; CHECK-NEXT: std 9, 40(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 41, 0(8) +; CHECK-NEXT: mr 8, 6 +; CHECK-NEXT: sldi 6, 3, 3 +; CHECK-NEXT: std 2, 144(1) # 8-byte Folded Spill +; CHECK-NEXT: std 11, 152(1) # 8-byte Folded Spill +; CHECK-NEXT: lxv 3, 0(2) +; CHECK-NEXT: lxv 2, 0(11) +; CHECK-NEXT: lxv 0, 0(7) +; CHECK-NEXT: add 6, 6, 23 +; CHECK-NEXT: lxv 7, 0(28) +; CHECK-NEXT: add 28, 3, 31 +; CHECK-NEXT: lxv 40, 0(9) +; CHECK-NEXT: lxv 39, 0(10) +; CHECK-NEXT: lxv 38, 0(15) +; CHECK-NEXT: lxv 33, 0(14) +; CHECK-NEXT: lxv 32, 0(16) +; CHECK-NEXT: lxv 37, 0(17) +; CHECK-NEXT: lxv 35, 0(18) +; CHECK-NEXT: lxv 13, 0(19) +; CHECK-NEXT: lxv 10, 0(20) +; CHECK-NEXT: lxv 8, 0(21) +; CHECK-NEXT: lxv 6, 0(22) +; CHECK-NEXT: lxv 4, 0(30) +; CHECK-NEXT: lxv 1, 0(12) +; CHECK-NEXT: lxv 36, 0(24) +; CHECK-NEXT: lxv 34, 0(25) +; CHECK-NEXT: lxv 12, 0(26) +; CHECK-NEXT: lxv 9, 0(27) +; CHECK-NEXT: lxv 5, 0(29) +; CHECK-NEXT: addi 5, 5, -2 +; CHECK-NEXT: sldi 11, 3, 4 +; CHECK-NEXT: std 12, 160(1) # 8-byte Folded Spill +; CHECK-NEXT: std 7, 168(1) # 8-byte Folded Spill +; CHECK-NEXT: add 7, 3, 0 +; CHECK-NEXT: add 12, 11, 23 +; CHECK-NEXT: addi 11, 6, 32 +; CHECK-NEXT: addi 12, 12, 32 +; CHECK-NEXT: std 22, 128(1) # 8-byte Folded Spill +; CHECK-NEXT: std 30, 136(1) # 8-byte Folded Spill +; CHECK-NEXT: std 26, 192(1) # 8-byte Folded Spill +; CHECK-NEXT: std 27, 200(1) # 8-byte Folded Spill +; CHECK-NEXT: mulli 26, 3, 48 +; CHECK-NEXT: mulli 22, 3, 6 +; CHECK-NEXT: 
sldi 6, 7, 3 +; CHECK-NEXT: add 30, 23, 6 +; CHECK-NEXT: std 29, 216(1) # 8-byte Folded Spill +; CHECK-NEXT: std 24, 176(1) # 8-byte Folded Spill +; CHECK-NEXT: std 25, 184(1) # 8-byte Folded Spill +; CHECK-NEXT: li 25, 1 +; CHECK-NEXT: li 24, 0 +; CHECK-NEXT: std 10, 48(1) # 8-byte Folded Spill +; CHECK-NEXT: std 15, 56(1) # 8-byte Folded Spill +; CHECK-NEXT: std 14, 64(1) # 8-byte Folded Spill +; CHECK-NEXT: std 16, 72(1) # 8-byte Folded Spill +; CHECK-NEXT: std 17, 80(1) # 8-byte Folded Spill +; CHECK-NEXT: std 18, 88(1) # 8-byte Folded Spill +; CHECK-NEXT: std 20, 112(1) # 8-byte Folded Spill +; CHECK-NEXT: std 21, 120(1) # 8-byte Folded Spill +; CHECK-NEXT: rldicl 5, 5, 61, 3 +; CHECK-NEXT: addi 2, 5, 1 +; CHECK-NEXT: sldi 5, 3, 5 +; CHECK-NEXT: add 29, 23, 5 +; CHECK-NEXT: sldi 5, 28, 3 +; CHECK-NEXT: add 27, 23, 5 +; CHECK-NEXT: mr 5, 23 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB1_3: # %_loop_2_do_.lr.ph +; CHECK-NEXT: # =>This Loop Header: Depth=1 +; CHECK-NEXT: # Child Loop BB1_4 Depth 2 +; CHECK-NEXT: maddld 6, 22, 24, 7 +; CHECK-NEXT: maddld 20, 22, 24, 0 +; CHECK-NEXT: mtctr 2 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 21, 23, 6 +; CHECK-NEXT: sldi 6, 20, 3 +; CHECK-NEXT: add 20, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 28 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 19, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 31 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 18, 23, 6 +; CHECK-NEXT: maddld 6, 22, 24, 3 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 17, 23, 6 +; CHECK-NEXT: mulld 6, 22, 24 +; CHECK-NEXT: sldi 6, 6, 3 +; CHECK-NEXT: add 16, 23, 6 +; CHECK-NEXT: mr 6, 8 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB1_4: # %_loop_2_do_ +; CHECK-NEXT: # Parent Loop BB1_3 Depth=1 +; CHECK-NEXT: # => This Inner Loop Header: Depth=2 +; CHECK-NEXT: lxvp 42, 0(6) +; CHECK-NEXT: lxvp 44, 0(16) +; CHECK-NEXT: lxvp 46, 0(17) +; CHECK-NEXT: lxvp 48, 0(18) +; CHECK-NEXT: lxvp 50, 0(19) +; CHECK-NEXT: lxvp 30, 0(20) +; CHECK-NEXT: lxvp 28, 0(21) +; CHECK-NEXT: 
lxvp 26, 32(6) +; CHECK-NEXT: lxvp 24, 32(16) +; CHECK-NEXT: lxvp 22, 32(17) +; CHECK-NEXT: lxvp 20, 32(18) +; CHECK-NEXT: lxvp 18, 32(19) +; CHECK-NEXT: addi 6, 6, 64 +; CHECK-NEXT: addi 16, 16, 64 +; CHECK-NEXT: addi 17, 17, 64 +; CHECK-NEXT: addi 18, 18, 64 +; CHECK-NEXT: addi 19, 19, 64 +; CHECK-NEXT: xvmaddadp 41, 45, 43 +; CHECK-NEXT: xvmaddadp 40, 47, 43 +; CHECK-NEXT: xvmaddadp 39, 49, 43 +; CHECK-NEXT: xvmaddadp 38, 51, 43 +; CHECK-NEXT: xvmaddadp 33, 31, 43 +; CHECK-NEXT: xvmaddadp 32, 29, 43 +; CHECK-NEXT: xvmaddadp 37, 44, 42 +; CHECK-NEXT: xvmaddadp 35, 46, 42 +; CHECK-NEXT: xvmaddadp 13, 48, 42 +; CHECK-NEXT: xvmaddadp 11, 50, 42 +; CHECK-NEXT: xvmaddadp 10, 30, 42 +; CHECK-NEXT: xvmaddadp 8, 28, 42 +; CHECK-NEXT: lxvp 42, 32(20) +; CHECK-NEXT: lxvp 44, 32(21) +; CHECK-NEXT: addi 20, 20, 64 +; CHECK-NEXT: addi 21, 21, 64 +; CHECK-NEXT: xvmaddadp 6, 25, 27 +; CHECK-NEXT: xvmaddadp 4, 23, 27 +; CHECK-NEXT: xvmaddadp 3, 21, 27 +; CHECK-NEXT: xvmaddadp 2, 19, 27 +; CHECK-NEXT: xvmaddadp 36, 24, 26 +; CHECK-NEXT: xvmaddadp 34, 22, 26 +; CHECK-NEXT: xvmaddadp 12, 20, 26 +; CHECK-NEXT: xvmaddadp 9, 18, 26 +; CHECK-NEXT: xvmaddadp 1, 43, 27 +; CHECK-NEXT: xvmaddadp 0, 45, 27 +; CHECK-NEXT: xvmaddadp 7, 42, 26 +; CHECK-NEXT: xvmaddadp 5, 44, 26 +; CHECK-NEXT: bdnz .LBB1_4 +; CHECK-NEXT: # %bb.5: # %_loop_2_endl_ +; CHECK-NEXT: # +; CHECK-NEXT: addi 25, 25, 6 +; CHECK-NEXT: add 5, 5, 26 +; CHECK-NEXT: add 11, 11, 26 +; CHECK-NEXT: add 30, 30, 26 +; CHECK-NEXT: add 12, 12, 26 +; CHECK-NEXT: add 29, 29, 26 +; CHECK-NEXT: add 27, 27, 26 +; CHECK-NEXT: addi 24, 24, 1 +; CHECK-NEXT: cmpld 25, 4 +; CHECK-NEXT: ble 0, .LBB1_3 +; CHECK-NEXT: # %bb.6: # %_loop_1_loopHeader_._return_bb_crit_edge.loopexit +; CHECK-NEXT: ld 3, 32(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 41, 0(3) +; CHECK-NEXT: ld 3, 40(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 40, 0(3) +; CHECK-NEXT: ld 3, 48(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 39, 0(3) +; CHECK-NEXT: ld 3, 56(1) # 
8-byte Folded Reload +; CHECK-NEXT: stxv 38, 0(3) +; CHECK-NEXT: ld 3, 64(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 33, 0(3) +; CHECK-NEXT: ld 3, 72(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 32, 0(3) +; CHECK-NEXT: ld 3, 80(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 37, 0(3) +; CHECK-NEXT: ld 3, 88(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 35, 0(3) +; CHECK-NEXT: ld 3, 96(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 13, 0(3) +; CHECK-NEXT: ld 3, 104(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 11, 0(3) +; CHECK-NEXT: ld 3, 112(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 10, 0(3) +; CHECK-NEXT: ld 3, 120(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 8, 0(3) +; CHECK-NEXT: ld 3, 128(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 6, 0(3) +; CHECK-NEXT: ld 3, 136(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 4, 0(3) +; CHECK-NEXT: ld 3, 144(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 3, 0(3) +; CHECK-NEXT: ld 3, 152(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 2, 0(3) +; CHECK-NEXT: ld 3, 160(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 1, 0(3) +; CHECK-NEXT: ld 3, 168(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 0, 0(3) +; CHECK-NEXT: ld 3, 176(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 36, 0(3) +; CHECK-NEXT: ld 3, 184(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 34, 0(3) +; CHECK-NEXT: ld 3, 192(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 12, 0(3) +; CHECK-NEXT: ld 3, 200(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 9, 0(3) +; CHECK-NEXT: ld 3, 208(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 7, 0(3) +; CHECK-NEXT: ld 3, 216(1) # 8-byte Folded Reload +; CHECK-NEXT: stxv 5, 0(3) +; CHECK-NEXT: .LBB1_7: # %_return_bb +; CHECK-NEXT: lfd 31, 472(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 30, 464(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 31, 360(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 30, 352(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 29, 344(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 28, 336(1) # 8-byte Folded Reload +; 
CHECK-NEXT: ld 27, 328(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 26, 320(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 25, 312(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 29, 456(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 24, 304(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 23, 296(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 22, 288(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 28, 448(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 21, 280(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 20, 272(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 19, 264(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 27, 440(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 18, 256(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 17, 248(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 16, 240(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 26, 432(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 15, 232(1) # 8-byte Folded Reload +; CHECK-NEXT: ld 14, 224(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 25, 424(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 24, 416(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 23, 408(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 22, 400(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 21, 392(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 20, 384(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 19, 376(1) # 8-byte Folded Reload +; CHECK-NEXT: lfd 18, 368(1) # 8-byte Folded Reload +; CHECK-NEXT: addi 1, 1, 480 +; CHECK-NEXT: blr +entry: + %_val_l_ = load i32, i32* %.l, align 4 + %_conv = sext i32 %_val_l_ to i64 + %_mult_tmp = shl nsw i64 %_conv, 3 + %_sub_tmp4 = sub nuw nsw i64 -8, %_mult_tmp + %_val_n_ = load i32, i32* %.n, align 4 + %_leq_tmp.not116 = icmp slt i32 %_val_n_, 1 + br i1 %_leq_tmp.not116, label %_return_bb, label %_loop_1_do_.lr.ph + +_loop_1_do_.lr.ph: ; preds = %entry + %_val_m_ = load i32, i32* %.m, align 4 + %_leq_tmp6.not114 = icmp slt i32 %_val_m_, 1 + br i1 %_leq_tmp6.not114, label %_return_bb, label %_loop_1_do_.preheader + +_loop_1_do_.preheader: ; preds = %_loop_1_do_.lr.ph + 
%x_rvo_based_addr_112 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1 + %a_byte_ptr_ = bitcast [0 x %_elem_type_of_a]* %.a to i8* + %a_rvo_based_addr_ = getelementptr inbounds i8, i8* %a_byte_ptr_, i64 %_sub_tmp4 + %.vy01.promoted = load <2 x double>, <2 x double>* %.vy01, align 16 + %.vy02.promoted = load <2 x double>, <2 x double>* %.vy02, align 16 + %.vy03.promoted = load <2 x double>, <2 x double>* %.vy03, align 16 + %.vy04.promoted = load <2 x double>, <2 x double>* %.vy04, align 16 + %.vy05.promoted = load <2 x double>, <2 x double>* %.vy05, align 16 + %.vy06.promoted = load <2 x double>, <2 x double>* %.vy06, align 16 + %.vy07.promoted = load <2 x double>, <2 x double>* %.vy07, align 16 + %.vy08.promoted = load <2 x double>, <2 x double>* %.vy08, align 16 + %.vy09.promoted = load <2 x double>, <2 x double>* %.vy09, align 16 + %.vy0a.promoted = load <2 x double>, <2 x double>* %.vy0a, align 16 + %.vy0b.promoted = load <2 x double>, <2 x double>* %.vy0b, align 16 + %.vy0c.promoted = load <2 x double>, <2 x double>* %.vy0c, align 16 + %.vy21.promoted = load <2 x double>, <2 x double>* %.vy21, align 16 + %.vy22.promoted = load <2 x double>, <2 x double>* %.vy22, align 16 + %.vy23.promoted = load <2 x double>, <2 x double>* %.vy23, align 16 + %.vy24.promoted = load <2 x double>, <2 x double>* %.vy24, align 16 + %.vy25.promoted = load <2 x double>, <2 x double>* %.vy25, align 16 + %.vy26.promoted = load <2 x double>, <2 x double>* %.vy26, align 16 + %.vy27.promoted = load <2 x double>, <2 x double>* %.vy27, align 16 + %.vy28.promoted = load <2 x double>, <2 x double>* %.vy28, align 16 + %.vy29.promoted = load <2 x double>, <2 x double>* %.vy29, align 16 + %.vy2a.promoted = load <2 x double>, <2 x double>* %.vy2a, align 16 + %.vy2b.promoted = load <2 x double>, <2 x double>* %.vy2b, align 16 + %.vy2c.promoted = load <2 x double>, <2 x double>* %.vy2c, align 16 + %0 = zext i32 %_val_m_ to i64 + %1 = zext i32 %_val_n_ to 
i64 + br label %_loop_2_do_.lr.ph + +_loop_2_do_.lr.ph: ; preds = %_loop_2_endl_, %_loop_1_do_.preheader + %indvars.iv212 = phi i64 [ %indvars.iv.next213, %_loop_2_endl_ ], [ 1, %_loop_1_do_.preheader ] + %2 = phi <2 x double> [ %142, %_loop_2_endl_ ], [ %.vy2c.promoted, %_loop_1_do_.preheader ] + %3 = phi <2 x double> [ %140, %_loop_2_endl_ ], [ %.vy2b.promoted, %_loop_1_do_.preheader ] + %4 = phi <2 x double> [ %138, %_loop_2_endl_ ], [ %.vy2a.promoted, %_loop_1_do_.preheader ] + %5 = phi <2 x double> [ %136, %_loop_2_endl_ ], [ %.vy29.promoted, %_loop_1_do_.preheader ] + %6 = phi <2 x double> [ %134, %_loop_2_endl_ ], [ %.vy28.promoted, %_loop_1_do_.preheader ] + %7 = phi <2 x double> [ %132, %_loop_2_endl_ ], [ %.vy27.promoted, %_loop_1_do_.preheader ] + %8 = phi <2 x double> [ %129, %_loop_2_endl_ ], [ %.vy26.promoted, %_loop_1_do_.preheader ] + %9 = phi <2 x double> [ %127, %_loop_2_endl_ ], [ %.vy25.promoted, %_loop_1_do_.preheader ] + %10 = phi <2 x double> [ %125, %_loop_2_endl_ ], [ %.vy24.promoted, %_loop_1_do_.preheader ] + %11 = phi <2 x double> [ %123, %_loop_2_endl_ ], [ %.vy23.promoted, %_loop_1_do_.preheader ] + %12 = phi <2 x double> [ %121, %_loop_2_endl_ ], [ %.vy22.promoted, %_loop_1_do_.preheader ] + %13 = phi <2 x double> [ %119, %_loop_2_endl_ ], [ %.vy21.promoted, %_loop_1_do_.preheader ] + %14 = phi <2 x double> [ %116, %_loop_2_endl_ ], [ %.vy0c.promoted, %_loop_1_do_.preheader ] + %15 = phi <2 x double> [ %114, %_loop_2_endl_ ], [ %.vy0b.promoted, %_loop_1_do_.preheader ] + %16 = phi <2 x double> [ %112, %_loop_2_endl_ ], [ %.vy0a.promoted, %_loop_1_do_.preheader ] + %17 = phi <2 x double> [ %110, %_loop_2_endl_ ], [ %.vy09.promoted, %_loop_1_do_.preheader ] + %18 = phi <2 x double> [ %108, %_loop_2_endl_ ], [ %.vy08.promoted, %_loop_1_do_.preheader ] + %19 = phi <2 x double> [ %106, %_loop_2_endl_ ], [ %.vy07.promoted, %_loop_1_do_.preheader ] + %20 = phi <2 x double> [ %81, %_loop_2_endl_ ], [ %.vy06.promoted, %_loop_1_do_.preheader ] 
+ %21 = phi <2 x double> [ %79, %_loop_2_endl_ ], [ %.vy05.promoted, %_loop_1_do_.preheader ] + %22 = phi <2 x double> [ %77, %_loop_2_endl_ ], [ %.vy04.promoted, %_loop_1_do_.preheader ] + %23 = phi <2 x double> [ %75, %_loop_2_endl_ ], [ %.vy03.promoted, %_loop_1_do_.preheader ] + %24 = phi <2 x double> [ %73, %_loop_2_endl_ ], [ %.vy02.promoted, %_loop_1_do_.preheader ] + %25 = phi <2 x double> [ %71, %_loop_2_endl_ ], [ %.vy01.promoted, %_loop_1_do_.preheader ] + %_ix_x_len10 = mul i64 %_mult_tmp, %indvars.iv212 + %a_ix_dim_0_ = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len10 + %26 = add nuw nsw i64 %indvars.iv212, 1 + %_ix_x_len24 = mul i64 %_mult_tmp, %26 + %a_ix_dim_0_25 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len24 + %27 = add nuw nsw i64 %indvars.iv212, 2 + %_ix_x_len40 = mul i64 %_mult_tmp, %27 + %a_ix_dim_0_41 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len40 + %28 = add nuw nsw i64 %indvars.iv212, 3 + %_ix_x_len56 = mul i64 %_mult_tmp, %28 + %a_ix_dim_0_57 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len56 + %29 = add nuw nsw i64 %indvars.iv212, 4 + %_ix_x_len72 = mul i64 %_mult_tmp, %29 + %a_ix_dim_0_73 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len72 + %30 = add nuw nsw i64 %indvars.iv212, 5 + %_ix_x_len88 = mul i64 %_mult_tmp, %30 + %a_ix_dim_0_89 = getelementptr inbounds i8, i8* %a_rvo_based_addr_, i64 %_ix_x_len88 + br label %_loop_2_do_ + +_loop_2_do_: ; preds = %_loop_2_do_.lr.ph, %_loop_2_do_ + %indvars.iv = phi i64 [ 1, %_loop_2_do_.lr.ph ], [ %indvars.iv.next, %_loop_2_do_ ] + %31 = phi <2 x double> [ %2, %_loop_2_do_.lr.ph ], [ %142, %_loop_2_do_ ] + %32 = phi <2 x double> [ %3, %_loop_2_do_.lr.ph ], [ %140, %_loop_2_do_ ] + %33 = phi <2 x double> [ %4, %_loop_2_do_.lr.ph ], [ %138, %_loop_2_do_ ] + %34 = phi <2 x double> [ %5, %_loop_2_do_.lr.ph ], [ %136, %_loop_2_do_ ] + %35 = phi <2 x double> [ %6, %_loop_2_do_.lr.ph ], [ %134, 
%_loop_2_do_ ] + %36 = phi <2 x double> [ %7, %_loop_2_do_.lr.ph ], [ %132, %_loop_2_do_ ] + %37 = phi <2 x double> [ %8, %_loop_2_do_.lr.ph ], [ %129, %_loop_2_do_ ] + %38 = phi <2 x double> [ %9, %_loop_2_do_.lr.ph ], [ %127, %_loop_2_do_ ] + %39 = phi <2 x double> [ %10, %_loop_2_do_.lr.ph ], [ %125, %_loop_2_do_ ] + %40 = phi <2 x double> [ %11, %_loop_2_do_.lr.ph ], [ %123, %_loop_2_do_ ] + %41 = phi <2 x double> [ %12, %_loop_2_do_.lr.ph ], [ %121, %_loop_2_do_ ] + %42 = phi <2 x double> [ %13, %_loop_2_do_.lr.ph ], [ %119, %_loop_2_do_ ] + %43 = phi <2 x double> [ %14, %_loop_2_do_.lr.ph ], [ %116, %_loop_2_do_ ] + %44 = phi <2 x double> [ %15, %_loop_2_do_.lr.ph ], [ %114, %_loop_2_do_ ] + %45 = phi <2 x double> [ %16, %_loop_2_do_.lr.ph ], [ %112, %_loop_2_do_ ] + %46 = phi <2 x double> [ %17, %_loop_2_do_.lr.ph ], [ %110, %_loop_2_do_ ] + %47 = phi <2 x double> [ %18, %_loop_2_do_.lr.ph ], [ %108, %_loop_2_do_ ] + %48 = phi <2 x double> [ %19, %_loop_2_do_.lr.ph ], [ %106, %_loop_2_do_ ] + %49 = phi <2 x double> [ %20, %_loop_2_do_.lr.ph ], [ %81, %_loop_2_do_ ] + %50 = phi <2 x double> [ %21, %_loop_2_do_.lr.ph ], [ %79, %_loop_2_do_ ] + %51 = phi <2 x double> [ %22, %_loop_2_do_.lr.ph ], [ %77, %_loop_2_do_ ] + %52 = phi <2 x double> [ %23, %_loop_2_do_.lr.ph ], [ %75, %_loop_2_do_ ] + %53 = phi <2 x double> [ %24, %_loop_2_do_.lr.ph ], [ %73, %_loop_2_do_ ] + %54 = phi <2 x double> [ %25, %_loop_2_do_.lr.ph ], [ %71, %_loop_2_do_ ] + %_ix_x_len = shl nuw nsw i64 %indvars.iv, 3 + %x_ix_dim_0_113 = getelementptr inbounds %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_112, i64 %indvars.iv + %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_113 to i8* + %55 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %x_ix_dim_0_) + %a_ix_dim_1_ = getelementptr inbounds i8, i8* %a_ix_dim_0_, i64 %_ix_x_len + %56 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_) + %a_ix_dim_1_29 = getelementptr inbounds i8, i8* %a_ix_dim_0_25, i64 
%_ix_x_len + %57 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_29) + %a_ix_dim_1_45 = getelementptr inbounds i8, i8* %a_ix_dim_0_41, i64 %_ix_x_len + %58 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_45) + %a_ix_dim_1_61 = getelementptr inbounds i8, i8* %a_ix_dim_0_57, i64 %_ix_x_len + %59 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_61) + %a_ix_dim_1_77 = getelementptr inbounds i8, i8* %a_ix_dim_0_73, i64 %_ix_x_len + %60 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_77) + %a_ix_dim_1_93 = getelementptr inbounds i8, i8* %a_ix_dim_0_89, i64 %_ix_x_len + %61 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* nonnull %a_ix_dim_1_93) + %62 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %55) + %.fca.0.extract35 = extractvalue { <16 x i8>, <16 x i8> } %62, 0 + %.fca.1.extract36 = extractvalue { <16 x i8>, <16 x i8> } %62, 1 + %63 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %56) + %.fca.0.extract29 = extractvalue { <16 x i8>, <16 x i8> } %63, 0 + %.fca.1.extract30 = extractvalue { <16 x i8>, <16 x i8> } %63, 1 + %64 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %57) + %.fca.0.extract23 = extractvalue { <16 x i8>, <16 x i8> } %64, 0 + %.fca.1.extract24 = extractvalue { <16 x i8>, <16 x i8> } %64, 1 + %65 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %58) + %.fca.0.extract17 = extractvalue { <16 x i8>, <16 x i8> } %65, 0 + %.fca.1.extract18 = extractvalue { <16 x i8>, <16 x i8> } %65, 1 + %66 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %59) + %.fca.0.extract11 = extractvalue { <16 x i8>, <16 x i8> } %66, 0 + %.fca.1.extract12 = extractvalue { <16 x i8>, <16 x i8> } %66, 1 + %67 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %60) + %.fca.0.extract5 = extractvalue { <16 x i8>, <16 x i8> } %67, 0 + 
%.fca.1.extract6 = extractvalue { <16 x i8>, <16 x i8> } %67, 1 + %68 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %61) + %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 0 + %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %68, 1 + %69 = bitcast <16 x i8> %.fca.0.extract29 to <2 x double> + %70 = bitcast <16 x i8> %.fca.0.extract35 to <2 x double> + %71 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %69, <2 x double> %70, <2 x double> %54) + %72 = bitcast <16 x i8> %.fca.0.extract23 to <2 x double> + %73 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %72, <2 x double> %70, <2 x double> %53) + %74 = bitcast <16 x i8> %.fca.0.extract17 to <2 x double> + %75 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %74, <2 x double> %70, <2 x double> %52) + %76 = bitcast <16 x i8> %.fca.0.extract11 to <2 x double> + %77 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %76, <2 x double> %70, <2 x double> %51) + %78 = bitcast <16 x i8> %.fca.0.extract5 to <2 x double> + %79 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %78, <2 x double> %70, <2 x double> %50) + %80 = bitcast <16 x i8> %.fca.0.extract to <2 x double> + %81 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %80, <2 x double> %70, <2 x double> %49) + %82 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_ix_dim_0_113, i64 4 + %83 = bitcast %_elem_type_of_x* %82 to i8* + %84 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %83) + %85 = getelementptr i8, i8* %a_ix_dim_1_, i64 32 + %86 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %85) + %87 = getelementptr i8, i8* %a_ix_dim_1_29, i64 32 + %88 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %87) + %89 = getelementptr i8, i8* %a_ix_dim_1_45, i64 32 + %90 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %89) + %91 = getelementptr i8, i8* %a_ix_dim_1_61, i64 32 + %92 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %91) + %93 = 
getelementptr i8, i8* %a_ix_dim_1_77, i64 32 + %94 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %93) + %95 = getelementptr i8, i8* %a_ix_dim_1_93, i64 32 + %96 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %95) + %97 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %84) + %.fca.0.extract37 = extractvalue { <16 x i8>, <16 x i8> } %97, 0 + %.fca.1.extract39 = extractvalue { <16 x i8>, <16 x i8> } %97, 1 + %98 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %86) + %.fca.0.extract31 = extractvalue { <16 x i8>, <16 x i8> } %98, 0 + %.fca.1.extract33 = extractvalue { <16 x i8>, <16 x i8> } %98, 1 + %99 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %88) + %.fca.0.extract25 = extractvalue { <16 x i8>, <16 x i8> } %99, 0 + %.fca.1.extract27 = extractvalue { <16 x i8>, <16 x i8> } %99, 1 + %100 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %90) + %.fca.0.extract19 = extractvalue { <16 x i8>, <16 x i8> } %100, 0 + %.fca.1.extract21 = extractvalue { <16 x i8>, <16 x i8> } %100, 1 + %101 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %92) + %.fca.0.extract13 = extractvalue { <16 x i8>, <16 x i8> } %101, 0 + %.fca.1.extract15 = extractvalue { <16 x i8>, <16 x i8> } %101, 1 + %102 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %94) + %.fca.0.extract7 = extractvalue { <16 x i8>, <16 x i8> } %102, 0 + %.fca.1.extract9 = extractvalue { <16 x i8>, <16 x i8> } %102, 1 + %103 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %96) + %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %103, 0 + %.fca.1.extract3 = extractvalue { <16 x i8>, <16 x i8> } %103, 1 + %104 = bitcast <16 x i8> %.fca.1.extract30 to <2 x double> + %105 = bitcast <16 x i8> %.fca.1.extract36 to <2 x double> + %106 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %104, <2 x double> 
%105, <2 x double> %48) + %107 = bitcast <16 x i8> %.fca.1.extract24 to <2 x double> + %108 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %107, <2 x double> %105, <2 x double> %47) + %109 = bitcast <16 x i8> %.fca.1.extract18 to <2 x double> + %110 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %109, <2 x double> %105, <2 x double> %46) + %111 = bitcast <16 x i8> %.fca.1.extract12 to <2 x double> + %112 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %111, <2 x double> %105, <2 x double> %45) + %113 = bitcast <16 x i8> %.fca.1.extract6 to <2 x double> + %114 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %113, <2 x double> %105, <2 x double> %44) + %115 = bitcast <16 x i8> %.fca.1.extract to <2 x double> + %116 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %115, <2 x double> %105, <2 x double> %43) + %117 = bitcast <16 x i8> %.fca.0.extract31 to <2 x double> + %118 = bitcast <16 x i8> %.fca.0.extract37 to <2 x double> + %119 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %117, <2 x double> %118, <2 x double> %42) + %120 = bitcast <16 x i8> %.fca.0.extract25 to <2 x double> + %121 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %120, <2 x double> %118, <2 x double> %41) + %122 = bitcast <16 x i8> %.fca.0.extract19 to <2 x double> + %123 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %122, <2 x double> %118, <2 x double> %40) + %124 = bitcast <16 x i8> %.fca.0.extract13 to <2 x double> + %125 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %124, <2 x double> %118, <2 x double> %39) + %126 = bitcast <16 x i8> %.fca.0.extract7 to <2 x double> + %127 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %126, <2 x double> %118, <2 x double> %38) + %128 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double> + %129 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %128, <2 x double> %118, <2 x double> %37) + 
%130 = bitcast <16 x i8> %.fca.1.extract33 to <2 x double> + %131 = bitcast <16 x i8> %.fca.1.extract39 to <2 x double> + %132 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %130, <2 x double> %131, <2 x double> %36) + %133 = bitcast <16 x i8> %.fca.1.extract27 to <2 x double> + %134 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %133, <2 x double> %131, <2 x double> %35) + %135 = bitcast <16 x i8> %.fca.1.extract21 to <2 x double> + %136 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %135, <2 x double> %131, <2 x double> %34) + %137 = bitcast <16 x i8> %.fca.1.extract15 to <2 x double> + %138 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %137, <2 x double> %131, <2 x double> %33) + %139 = bitcast <16 x i8> %.fca.1.extract9 to <2 x double> + %140 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %139, <2 x double> %131, <2 x double> %32) + %141 = bitcast <16 x i8> %.fca.1.extract3 to <2 x double> + %142 = tail call contract <2 x double> @llvm.fma.v2f64(<2 x double> %141, <2 x double> %131, <2 x double> %31) + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8 + %_leq_tmp6.not = icmp ugt i64 %indvars.iv.next, %0 + br i1 %_leq_tmp6.not, label %_loop_2_endl_, label %_loop_2_do_ + +_loop_2_endl_: ; preds = %_loop_2_do_ + %indvars.iv.next213 = add nuw nsw i64 %indvars.iv212, 6 + %_leq_tmp.not = icmp ugt i64 %indvars.iv.next213, %1 + br i1 %_leq_tmp.not, label %_loop_1_loopHeader_._return_bb_crit_edge.loopexit, label %_loop_2_do_.lr.ph + +_loop_1_loopHeader_._return_bb_crit_edge.loopexit: ; preds = %_loop_2_endl_ + store <2 x double> %71, <2 x double>* %.vy01, align 16 + store <2 x double> %73, <2 x double>* %.vy02, align 16 + store <2 x double> %75, <2 x double>* %.vy03, align 16 + store <2 x double> %77, <2 x double>* %.vy04, align 16 + store <2 x double> %79, <2 x double>* %.vy05, align 16 + store <2 x double> %81, <2 x double>* %.vy06, align 16 + store <2 x double> %106, <2 x double>* 
%.vy07, align 16 + store <2 x double> %108, <2 x double>* %.vy08, align 16 + store <2 x double> %110, <2 x double>* %.vy09, align 16 + store <2 x double> %112, <2 x double>* %.vy0a, align 16 + store <2 x double> %114, <2 x double>* %.vy0b, align 16 + store <2 x double> %116, <2 x double>* %.vy0c, align 16 + store <2 x double> %119, <2 x double>* %.vy21, align 16 + store <2 x double> %121, <2 x double>* %.vy22, align 16 + store <2 x double> %123, <2 x double>* %.vy23, align 16 + store <2 x double> %125, <2 x double>* %.vy24, align 16 + store <2 x double> %127, <2 x double>* %.vy25, align 16 + store <2 x double> %129, <2 x double>* %.vy26, align 16 + store <2 x double> %132, <2 x double>* %.vy27, align 16 + store <2 x double> %134, <2 x double>* %.vy28, align 16 + store <2 x double> %136, <2 x double>* %.vy29, align 16 + store <2 x double> %138, <2 x double>* %.vy2a, align 16 + store <2 x double> %140, <2 x double>* %.vy2b, align 16 + store <2 x double> %142, <2 x double>* %.vy2c, align 16 + br label %_return_bb + +_return_bb: ; preds = %_loop_1_do_.lr.ph, %_loop_1_loopHeader_._return_bb_crit_edge.loopexit, %entry + ret void +} + declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll b/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll --- a/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/paired-vector-intrinsics.ll @@ -51,6 +51,42 @@ ret void } +; assemble_pair +declare <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8>, <16 x i8>) +define void @ass_pair1(<256 x i1>* %ptr, <16 x i8> %vc) { +; CHECK-LABEL: ass_pair1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmr v3, v2 +; CHECK-NEXT: stxv v2, 16(r3) +; 
CHECK-NEXT: stxv v3, 0(r3) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: ass_pair1: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: vmr v3, v2 +; CHECK-NOMMA-NEXT: stxv v2, 16(r3) +; CHECK-NOMMA-NEXT: stxv v3, 0(r3) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: ass_pair1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: vmr v3, v2 +; CHECK-BE-NEXT: stxv v2, 16(r3) +; CHECK-BE-NEXT: stxv v2, 0(r3) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: ass_pair1: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: vmr v3, v2 +; CHECK-BE-NOMMA-NEXT: stxv v2, 16(r3) +; CHECK-BE-NOMMA-NEXT: stxv v2, 0(r3) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = tail call <256 x i1> @llvm.ppc.mma.assemble.pair(<16 x i8> %vc, <16 x i8> %vc) + store <256 x i1> %0, <256 x i1>* %ptr, align 32 + ret void +} + ; disassemble_pair declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>) define void @disass_pair(<256 x i1>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3) { @@ -95,6 +131,50 @@ ret void } +; disassemble_pair +declare { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1>) +define void @disass_pair1(<256 x i1>* %ptr1, <16 x i8>* %ptr2, <16 x i8>* %ptr3) { +; CHECK-LABEL: disass_pair1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxv vs1, 0(r3) +; CHECK-NEXT: lxv vs0, 16(r3) +; CHECK-NEXT: stxv vs1, 0(r4) +; CHECK-NEXT: stxv vs0, 0(r5) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: disass_pair1: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxv vs1, 0(r3) +; CHECK-NOMMA-NEXT: lxv vs0, 16(r3) +; CHECK-NOMMA-NEXT: stxv vs1, 0(r4) +; CHECK-NOMMA-NEXT: stxv vs0, 0(r5) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: disass_pair1: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxv vs1, 16(r3) +; CHECK-BE-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r4) +; CHECK-BE-NEXT: stxv vs1, 0(r5) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: disass_pair1: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxv vs1, 16(r3) +; 
CHECK-BE-NOMMA-NEXT: lxv vs0, 0(r3) +; CHECK-BE-NOMMA-NEXT: stxv vs0, 0(r4) +; CHECK-BE-NOMMA-NEXT: stxv vs1, 0(r5) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = load <256 x i1>, <256 x i1>* %ptr1, align 32 + %1 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.pair(<256 x i1> %0) + %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0 + %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1 + store <16 x i8> %2, <16 x i8>* %ptr2, align 16 + store <16 x i8> %3, <16 x i8>* %ptr3, align 16 + ret void +} + define void @test_ldst_1(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_1: ; CHECK: # %bb.0: # %entry @@ -127,8 +207,42 @@ ret void } +define void @test_ldst_1a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_1a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 0(r3) +; CHECK-NEXT: stxvp vsp0, 0(r4) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_1a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvp vsp0, 0(r3) +; CHECK-NOMMA-NEXT: stxvp vsp0, 0(r4) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_1a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_1a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvp vsp0, 0(r3) +; CHECK-BE-NOMMA-NEXT: stxvp vsp0, 0(r4) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %0) + %2 = bitcast <256 x i1>* %vp2 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %1, i8* %2) + ret void +} + declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*) +declare <256 x i1> @llvm.ppc.mma.lxvp(i8*) declare void @llvm.ppc.vsx.stxvp(<256 x i1>, i8*) +declare void @llvm.ppc.mma.stxvp(<256 x i1>, i8*) define void @test_ldst_2(<256 x i1>* %vpp, i64 %offset, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_2: @@ -164,6 +278,40 @@ ret void } +define void @test_ldst_2a(<256 x i1>* %vpp, i64 %offset, <256 x 
i1>* %vp2) { +; CHECK-LABEL: test_ldst_2a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_2a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r4 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r5, r4 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_2a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_2a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r4 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r5, r4 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 %offset + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 %offset + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + define void @test_ldst_3(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_3: ; CHECK: # %bb.0: # %entry @@ -202,6 +350,44 @@ ret void } +define void @test_ldst_3a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_3a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 18 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_3a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 18 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_3a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 18 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_3a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 18 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; 
CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 18 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 18 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + define void @test_ldst_4(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_4: ; CHECK: # %bb.0: # %entry @@ -240,6 +426,44 @@ ret void } +define void @test_ldst_4a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_4a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 1 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_4a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 1 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_4a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 1 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_4a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 1 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 1 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 1 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + define void @test_ldst_5(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_5: ; CHECK: # %bb.0: # %entry @@ -278,6 +502,44 @@ ret void } +define void @test_ldst_5a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_5a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li r5, 42 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; 
CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_5a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: li r5, 42 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_5a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: li r5, 42 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_5a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: li r5, 42 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 42 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 42 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + define void @test_ldst_6(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; CHECK-LABEL: test_ldst_6: ; CHECK: # %bb.0: # %entry @@ -312,6 +574,40 @@ ret void } +define void @test_ldst_6a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; CHECK-LABEL: test_ldst_6a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_6a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: lxvp vsp0, 4096(r3) +; CHECK-NOMMA-NEXT: stxvp vsp0, 4096(r4) +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_6a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_6a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: lxvp vsp0, 4096(r3) +; CHECK-BE-NOMMA-NEXT: stxvp vsp0, 4096(r4) +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = getelementptr <256 x i1>, <256 x i1>* %vpp, i64 128 + %1 = bitcast <256 x i1>* %0 to i8* + %2 = tail call <256 x i1> 
@llvm.ppc.mma.lxvp(i8* %1) + %3 = getelementptr <256 x i1>, <256 x i1>* %vp2, i64 128 + %4 = bitcast <256 x i1>* %3 to i8* + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +} + define void @test_ldst_7(<256 x i1>* %vpp, <256 x i1>* %vp2) { ; FIXME: A prefixed load (plxvp) is expected here as the offset in this ; test case is a constant that fits within 34-bits. @@ -351,3 +647,43 @@ tail call void @llvm.ppc.vsx.stxvp(<256 x i1> %2, i8* %4) ret void } + +define void @test_ldst_7a(<256 x i1>* %vpp, <256 x i1>* %vp2) { +; FIXME: A prefixed load (plxvp) is expected here as the offset in this +; test case is a constant that fits within 34-bits. +; CHECK-LABEL: test_ldst_7a: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pli r5, 32799 +; CHECK-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NEXT: blr +; +; CHECK-NOMMA-LABEL: test_ldst_7a: +; CHECK-NOMMA: # %bb.0: # %entry +; CHECK-NOMMA-NEXT: pli r5, 32799 +; CHECK-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-NOMMA-NEXT: blr +; +; CHECK-BE-LABEL: test_ldst_7a: +; CHECK-BE: # %bb.0: # %entry +; CHECK-BE-NEXT: pli r5, 32799 +; CHECK-BE-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NEXT: blr +; +; CHECK-BE-NOMMA-LABEL: test_ldst_7a: +; CHECK-BE-NOMMA: # %bb.0: # %entry +; CHECK-BE-NOMMA-NEXT: pli r5, 32799 +; CHECK-BE-NOMMA-NEXT: lxvpx vsp0, r3, r5 +; CHECK-BE-NOMMA-NEXT: stxvpx vsp0, r4, r5 +; CHECK-BE-NOMMA-NEXT: blr +entry: + %0 = bitcast <256 x i1>* %vpp to i8* + %1 = getelementptr i8, i8* %0, i64 32799 + %2 = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* %1) + %3 = bitcast <256 x i1>* %vp2 to i8* + %4 = getelementptr i8, i8* %3, i64 32799 + tail call void @llvm.ppc.mma.stxvp(<256 x i1> %2, i8* %4) + ret void +}