Index: clang/include/clang/Basic/BuiltinsPPC.def
===================================================================
--- clang/include/clang/Basic/BuiltinsPPC.def
+++ clang/include/clang/Basic/BuiltinsPPC.def
@@ -305,6 +305,12 @@
 BUILTIN(__builtin_altivec_vextractdm, "UiV2ULLi", "")
 BUILTIN(__builtin_altivec_vextractqm, "UiV1ULLLi", "")
 
+// P10 Vector Count with Mask built-ins.
+BUILTIN(__builtin_altivec_vcntmbb, "ULLiV16UcUi", "")
+BUILTIN(__builtin_altivec_vcntmbh, "ULLiV8UsUi", "")
+BUILTIN(__builtin_altivec_vcntmbw, "ULLiV4UiUi", "")
+BUILTIN(__builtin_altivec_vcntmbd, "ULLiV2ULLiUi", "")
+
 // P10 Vector Parallel Bits built-ins.
 BUILTIN(__builtin_altivec_vpdepd, "V2ULLiV2ULLiV2ULLi", "")
 BUILTIN(__builtin_altivec_vpextd, "V2ULLiV2ULLiV2ULLi", "")
Index: clang/lib/Headers/altivec.h
===================================================================
--- clang/lib/Headers/altivec.h
+++ clang/lib/Headers/altivec.h
@@ -2297,6 +2297,20 @@
   return __builtin_altivec_vctzd(__a);
 }
 
+/* vec_cntm */
+
+#ifdef __POWER10_VECTOR__
+#define vec_cntm(__a, __mp)                                                   \
+  _Generic((__a), vector unsigned char                                        \
+           : __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)),          \
+             vector unsigned short                                            \
+           : __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)),          \
+             vector unsigned int                                              \
+           : __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)),          \
+             vector unsigned long long                                        \
+           : __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp)))
+#endif /* __POWER10_VECTOR__ */
+
 /* vec_first_match_index */
 
 static __inline__ unsigned __ATTRS_o_ai
Index: clang/test/CodeGen/builtins-ppc-p10vector.c
===================================================================
--- clang/test/CodeGen/builtins-ppc-p10vector.c
+++ clang/test/CodeGen/builtins-ppc-p10vector.c
@@ -43,6 +43,30 @@
   return vec_extractm(vui128a);
 }
 
+unsigned long long test_vec_cntm_uc(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbb(<16 x i8> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vuca, 1);
+}
+
+unsigned long long test_vec_cntm_us(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbh(<8 x i16> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vusa, 0);
+}
+
+unsigned long long test_vec_cntm_ui(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbw(<4 x i32> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vuia, 1);
+}
+
+unsigned long long test_vec_cntm_ull(void) {
+  // CHECK: @llvm.ppc.altivec.vcntmbd(<2 x i64> %{{.+}}, i32
+  // CHECK-NEXT: ret i64
+  return vec_cntm(vulla, 0);
+}
+
 vector unsigned long long test_vpdepd(void) {
   // CHECK: @llvm.ppc.altivec.vpdepd(<2 x i64>
   // CHECK-NEXT: ret <2 x i64>
Index: llvm/include/llvm/IR/IntrinsicsPowerPC.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -435,6 +435,20 @@
   def int_ppc_altivec_vextractqm : GCCBuiltin<"__builtin_altivec_vextractqm">,
               Intrinsic<[llvm_i32_ty], [llvm_v1i128_ty], [IntrNoMem]>;
 
+  // P10 Vector Count with Mask intrinsics.
+  def int_ppc_altivec_vcntmbb : GCCBuiltin<"__builtin_altivec_vcntmbb">,
+              Intrinsic<[llvm_i64_ty], [llvm_v16i8_ty, llvm_i32_ty],
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+  def int_ppc_altivec_vcntmbh : GCCBuiltin<"__builtin_altivec_vcntmbh">,
+              Intrinsic<[llvm_i64_ty], [llvm_v8i16_ty, llvm_i32_ty],
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+  def int_ppc_altivec_vcntmbw : GCCBuiltin<"__builtin_altivec_vcntmbw">,
+              Intrinsic<[llvm_i64_ty], [llvm_v4i32_ty, llvm_i32_ty],
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+  def int_ppc_altivec_vcntmbd : GCCBuiltin<"__builtin_altivec_vcntmbd">,
+              Intrinsic<[llvm_i64_ty], [llvm_v2i64_ty, llvm_i32_ty],
+                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+
   // P10 Vector Parallel Bits Deposit/Extract Doubleword Builtins.
   def int_ppc_altivec_vpdepd : GCCBuiltin<"__builtin_altivec_vpdepd">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
Index: llvm/lib/Target/PowerPC/PPCInstrPrefix.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -225,6 +225,29 @@
   let Inst{21-31} = xo;
 }
 
+// VX-Form: [PO VRT / UIM RB XO].
+// We use VXForm_1 to implement it, that is, we use "VRA" (5 bit) to represent
+// "/ UIM" (unused bit followed by a 4-bit immediate)
+class VX_VRT5_UIM5_RB5<bits<11> xo, string opc, list<dag> pattern>
+  : VXForm_1<xo, (outs vrrc:$vD), (ins u4imm:$UIM, vrrc:$vB),
+             !strconcat(opc, " $vD, $vB, $UIM"), IIC_VecGeneral, pattern>;
+
+class VXForm_RD5_MP_VB5<bits<11> xo, bits<4> eo, dag OOL, dag IOL,
+                        string asmstr, InstrItinClass itin, list<dag> pattern>
+  : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> RD;
+  bits<5> VB;
+  bit MP;
+
+  let Pattern = pattern;
+
+  let Inst{6-10} = RD;
+  let Inst{11-14} = eo;
+  let Inst{15} = MP;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
 multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
                                        dag PCRel_IOL, string asmstr,
                                        InstrItinClass itin> {
@@ -586,6 +609,26 @@
                "vextractqm $rD, $vB", IIC_VecGeneral,
                [(set i32:$rD,
                     (int_ppc_altivec_vextractqm v1i128:$vB))]>;
+  def VCNTMBB : VXForm_RD5_MP_VB5<1602, 12, (outs g8rc:$rD),
+                                  (ins vrrc:$vB, u1imm:$MP),
+                                  "vcntmbb $rD, $vB, $MP", IIC_VecGeneral,
+                                  [(set i64:$rD,
+                                       (int_ppc_altivec_vcntmbb v16i8:$vB, timm:$MP))]>;
+  def VCNTMBH : VXForm_RD5_MP_VB5<1602, 13, (outs g8rc:$rD),
+                                  (ins vrrc:$vB, u1imm:$MP),
+                                  "vcntmbh $rD, $vB, $MP", IIC_VecGeneral,
+                                  [(set i64:$rD,
+                                       (int_ppc_altivec_vcntmbh v8i16:$vB, timm:$MP))]>;
+  def VCNTMBW : VXForm_RD5_MP_VB5<1602, 14, (outs g8rc:$rD),
+                                  (ins vrrc:$vB, u1imm:$MP),
+                                  "vcntmbw $rD, $vB, $MP", IIC_VecGeneral,
+                                  [(set i64:$rD,
+                                       (int_ppc_altivec_vcntmbw v4i32:$vB, timm:$MP))]>;
+  def VCNTMBD : VXForm_RD5_MP_VB5<1602, 15, (outs g8rc:$rD),
+                                  (ins vrrc:$vB, u1imm:$MP),
+                                  "vcntmbd $rD, $vB, $MP", IIC_VecGeneral,
+                                  [(set i64:$rD,
+                                       (int_ppc_altivec_vcntmbd v2i64:$vB, timm:$MP))]>;
   def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                "vpdepd $vD, $vA, $vB", IIC_VecGeneral,
                [(set v2i64:$vD,
Index: llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll
+++ llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll
@@ -61,3 +61,48 @@
 declare i32 @llvm.ppc.altivec.vextractwm(<4 x i32>)
 declare i32 @llvm.ppc.altivec.vextractdm(<2 x i64>)
 declare i32 @llvm.ppc.altivec.vextractqm(<1 x i128>)
+
+define i64 @test_vcntmbb(<16 x i8> %a) {
+; CHECK-LABEL: test_vcntmbb:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcntmbb r3, v2, 1
+; CHECK-NEXT:    blr
+entry:
+  %cnt = tail call i64 @llvm.ppc.altivec.vcntmbb(<16 x i8> %a, i32 1)
+  ret i64 %cnt
+}
+
+define i64 @test_vcntmbh(<8 x i16> %a) {
+; CHECK-LABEL: test_vcntmbh:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcntmbh r3, v2, 0
+; CHECK-NEXT:    blr
+entry:
+  %cnt = tail call i64 @llvm.ppc.altivec.vcntmbh(<8 x i16> %a, i32 0)
+  ret i64 %cnt
+}
+
+define i64 @test_vcntmbw(<4 x i32> %a) {
+; CHECK-LABEL: test_vcntmbw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcntmbw r3, v2, 1
+; CHECK-NEXT:    blr
+entry:
+  %cnt = tail call i64 @llvm.ppc.altivec.vcntmbw(<4 x i32> %a, i32 1)
+  ret i64 %cnt
+}
+
+define i64 @test_vcntmbd(<2 x i64> %a) {
+; CHECK-LABEL: test_vcntmbd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vcntmbd r3, v2, 0
+; CHECK-NEXT:    blr
+entry:
+  %cnt = tail call i64 @llvm.ppc.altivec.vcntmbd(<2 x i64> %a, i32 0)
+  ret i64 %cnt
+}
+
+declare i64 @llvm.ppc.altivec.vcntmbb(<16 x i8>, i32)
+declare i64 @llvm.ppc.altivec.vcntmbh(<8 x i16>, i32)
+declare i64 @llvm.ppc.altivec.vcntmbw(<4 x i32>, i32)
+declare i64 @llvm.ppc.altivec.vcntmbd(<2 x i64>, i32)
Index: llvm/test/MC/Disassembler/PowerPC/p10insts.txt
===================================================================
--- llvm/test/MC/Disassembler/PowerPC/p10insts.txt
+++ llvm/test/MC/Disassembler/PowerPC/p10insts.txt
@@ -16,6 +16,18 @@
 # CHECK: vextractqm 1, 2
 0x10 0x2c 0x16 0x42
 
+# CHECK: vcntmbb 1, 2, 1
+0x10 0x39 0x16 0x42
+
+# CHECK: vcntmbh 1, 2, 1
+0x10 0x3b 0x16 0x42
+
+# CHECK: vcntmbw 1, 2, 0
+0x10 0x3c 0x16 0x42
+
+# CHECK: vcntmbd 1, 2, 0
+0x10 0x3e 0x16 0x42
+
 # CHECK: vpdepd 1, 2, 0
 0x10 0x22 0x05 0xcd
 
Index: llvm/test/MC/PowerPC/p10.s
===================================================================
--- llvm/test/MC/PowerPC/p10.s
+++ llvm/test/MC/PowerPC/p10.s
@@ -18,6 +18,18 @@
 # CHECK-BE: vextractqm 1, 2               # encoding: [0x10,0x2c,0x16,0x42]
 # CHECK-LE: vextractqm 1, 2               # encoding: [0x42,0x16,0x2c,0x10]
             vextractqm 1, 2
+# CHECK-BE: vcntmbb 1, 2, 1               # encoding: [0x10,0x39,0x16,0x42]
+# CHECK-LE: vcntmbb 1, 2, 1               # encoding: [0x42,0x16,0x39,0x10]
+            vcntmbb 1, 2, 1
+# CHECK-BE: vcntmbh 1, 2, 1               # encoding: [0x10,0x3b,0x16,0x42]
+# CHECK-LE: vcntmbh 1, 2, 1               # encoding: [0x42,0x16,0x3b,0x10]
+            vcntmbh 1, 2, 1
+# CHECK-BE: vcntmbw 1, 2, 0               # encoding: [0x10,0x3c,0x16,0x42]
+# CHECK-LE: vcntmbw 1, 2, 0               # encoding: [0x42,0x16,0x3c,0x10]
+            vcntmbw 1, 2, 0
+# CHECK-BE: vcntmbd 1, 2, 0               # encoding: [0x10,0x3e,0x16,0x42]
+# CHECK-LE: vcntmbd 1, 2, 0               # encoding: [0x42,0x16,0x3e,0x10]
+            vcntmbd 1, 2, 0
 # CHECK-BE: vpdepd 1, 2, 0                # encoding: [0x10,0x22,0x05,0xcd]
 # CHECK-LE: vpdepd 1, 2, 0                # encoding: [0xcd,0x05,0x22,0x10]
             vpdepd 1, 2, 0
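
For reference, a minimal, hypothetical usage sketch of the new vec_cntm interface (not part of the patch itself). It assumes a Power10 target compiled with -mcpu=pwr10 and the altivec.h change above; the function names are illustrative only.

  /* vec_cntm dispatches on the element type of its first operand via _Generic,
     so each call below maps to the corresponding vcntmb[b|h|w|d] builtin.
     The second operand becomes the 1-bit MP immediate of the instruction, so
     it should be a compile-time constant 0 or 1. */
  #include <altivec.h>

  unsigned long long cntm_bytes(vector unsigned char v) {
    return vec_cntm(v, 1); /* __builtin_altivec_vcntmbb */
  }

  unsigned long long cntm_dwords(vector unsigned long long v) {
    return vec_cntm(v, 0); /* __builtin_altivec_vcntmbd */
  }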