Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td
@@ -748,11 +748,17 @@
                      [IntrNoMem]>;
 }
 
-// PCLMUL instruction
+// PCLMUL instructions
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">,
           Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
                     [IntrNoMem]>;
+  def int_x86_pclmulqdq_256 : GCCBuiltin<"__builtin_ia32_pclmulqdq256">,
+          Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
+                    [IntrNoMem]>;
+  def int_x86_pclmulqdq_512 : GCCBuiltin<"__builtin_ia32_pclmulqdq512">,
+          Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
+                    [IntrNoMem]>;
 }
 
 // Vector pack
Index: llvm/trunk/lib/Support/Host.cpp
===================================================================
--- llvm/trunk/lib/Support/Host.cpp
+++ llvm/trunk/lib/Support/Host.cpp
@@ -1262,6 +1262,9 @@
   Features["pku"]  = HasLeaf7 && ((ECX >> 4) & 1);
   Features["vaes"] = HasLeaf7 && ((ECX >> 9) & 1) && HasAVXSave;
 
+  // VPCLMULQDQ (carry-less multiplication quadword)
+  Features["vpclmulqdq"] = HasLeaf7 && ((ECX >> 10) & 1) && HasAVXSave;
+
   bool HasLeafD = MaxLevel >= 0xd &&
                   !getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -160,6 +160,9 @@
 def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                          [FeatureSSE2]>;
+def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
+                                         "Enable vpclmulqdq instructions",
+                                         [FeatureAVX, FeaturePCLMUL]>;
 def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
                                    "Enable four-operand fused multiple-add",
                                    [FeatureAVX, FeatureSSE4A]>;
Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td
@@ -10017,6 +10017,7 @@
 //===----------------------------------------------------------------------===//
 // AES instructions
 //===----------------------------------------------------------------------===//
+
 multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
   let Predicates = [HasVLX, HasVAES] in {
     defm Z128 : AESI_binop_rm_int<Op, OpStr,
@@ ... @@
 defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">;
 
+//===----------------------------------------------------------------------===//
+// PCLMUL instructions - Carry less multiplication
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasAVX512, HasVPCLMULQDQ] in
+defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>,
+                   EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+let Predicates = [HasVLX, HasVPCLMULQDQ] in {
+defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>,
+                      EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64,
+                                int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256,
+                                EVEX_CD8<64, CD8VF>, VEX_WIG;
+}
+
+// Aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ", VR512, i512mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ128", VR128X, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
+
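For reference, the semantics behind these intrinsics: each 128-bit lane computes a carry-less (GF(2)[x]) product of one 64-bit half from each source, selected by bits 0 and 4 of the immediate, and the new 256/512-bit intrinsics simply apply that per lane. A minimal scalar model, illustrative only (the names are not from the patch):

```cpp
#include <cstdint>

// Carry-less 64x64 -> 128-bit multiply: like integer multiplication, but
// partial products are combined with XOR, so no carries propagate.
static void clmul64(uint64_t A, uint64_t B, uint64_t &Lo, uint64_t &Hi) {
  Lo = Hi = 0;
  for (int I = 0; I < 64; ++I) {
    if ((B >> I) & 1) {
      Lo ^= A << I;
      if (I != 0)
        Hi ^= A >> (64 - I);
    }
  }
}

// One 128-bit lane of (v)pclmulqdq: imm bit 0 picks the half of src1,
// imm bit 4 the half of src2 (see the X86InstrInfo.cpp comment below).
static void pclmulLane(const uint64_t Src1[2], const uint64_t Src2[2],
                       uint8_t Imm, uint64_t Dst[2]) {
  clmul64(Src1[Imm & 1], Src2[(Imm >> 4) & 1], Dst[0], Dst[1]);
}
```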
Index: llvm/trunk/lib/Target/X86/X86InstrFormats.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFormats.td
+++ llvm/trunk/lib/Target/X86/X86InstrFormats.td
@@ -854,13 +854,7 @@
 // PCLMUL Instruction Templates
 class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
                list<dag>pattern, InstrItinClass itin = NoItinerary>
-      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
-        Requires<[NoAVX, HasPCLMUL]>;
-
-class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
-                   list<dag>pattern, InstrItinClass itin = NoItinerary>
-      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
-        VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD;
 
 // FMA3 Instruction Templates
 class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
@@ -5255,7 +5255,11 @@
                                                    OpIdx1, OpIdx2);
   }
   case X86::PCLMULQDQrr:
-  case X86::VPCLMULQDQrr:{
+  case X86::VPCLMULQDQrr:
+  case X86::VPCLMULQDQYrr:
+  case X86::VPCLMULQDQZrr:
+  case X86::VPCLMULQDQZ128rr:
+  case X86::VPCLMULQDQZ256rr: {
     // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
     // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
     unsigned Imm = MI.getOperand(3).getImm();
Index: llvm/trunk/lib/Target/X86/X86InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.td
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.td
@@ -843,6 +843,9 @@
 def HasXSAVEC    : Predicate<"Subtarget->hasXSAVEC()">;
 def HasXSAVES    : Predicate<"Subtarget->hasXSAVES()">;
 def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
+def NoVLX_Or_NoVPCLMULQDQ :
+           Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">;
+def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
 def HasFMA       : Predicate<"Subtarget->hasFMA()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
 def HasXOP       : Predicate<"Subtarget->hasXOP()">;
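The new commute cases above work because swapping the two sources of a pclmulqdq only requires exchanging immediate bits 0 and 4, which is what the code these cases feed into already does for the 128-bit forms. A standalone sketch of that transform (variable names mirror the existing code; the helper itself is illustrative):

```cpp
#include <cstdint>

// Exchange which source each immediate bit selects: bit 0 (half of SRC1)
// trades places with bit 4 (half of SRC2). 0x01 <-> 0x10, while 0x00 and
// 0x11 map to themselves, so those uses commute with no immediate change.
static uint8_t commutePCLMULImm(uint8_t Imm) {
  uint8_t Src1Hi = Imm & 0x01;
  uint8_t Src2Hi = Imm & 0x10;
  return static_cast<uint8_t>((Src1Hi << 4) | (Src2Hi >> 4));
}
```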
Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -7242,40 +7242,84 @@
 // PCLMUL Instructions
 //===----------------------------------------------------------------------===//
 
-// AVX carry-less Multiplication instructions
-let isCommutable = 1 in
-def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
-           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           [(set VR128:$dst,
-             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
-           Sched<[WriteCLMul]>, VEX_WIG;
-
-def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
-           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (loadv2i64 addr:$src2), imm:$src3))]>,
-           Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;
+// SSE carry-less Multiplication instructions
+let Constraints = "$src1 = $dst", Predicates = [NoAVX, HasPCLMUL] in {
+  let isCommutable = 1 in
+  def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+            (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+            [(set VR128:$dst,
+              (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+            IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+  def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+            (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+            [(set VR128:$dst,
+              (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
+                imm:$src3))],
+            IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMulLd, ReadAfterLd]>;
+}
+
+// SSE aliases
+foreach HI = ["hq","lq"] in
+foreach LO = ["hq","lq"] in {
+  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+                  (PCLMULQDQrr VR128:$dst, VR128:$src,
+                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
+  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
+                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
+}
 
-// Carry-less Multiplication instructions
-let Constraints = "$src1 = $dst" in {
-let isCommutable = 1 in
-def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
-           (ins VR128:$src1, VR128:$src2, u8imm:$src3),
-           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-           [(set VR128:$dst,
-             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
-           IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
-
-def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
-           (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
-           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (memopv2i64 addr:$src2), imm:$src3))],
-           IIC_SSE_PCLMULQDQ_RM>,
-           Sched<[WriteCLMulLd, ReadAfterLd]>;
-} // Constraints = "$src1 = $dst"
+// AVX carry-less Multiplication instructions
+multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
+                      PatFrag LdFrag, Intrinsic IntId> {
+  let isCommutable = 1 in
+  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
+            (ins RC:$src1, RC:$src2, u8imm:$src3),
+            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+            [(set RC:$dst,
+              (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+            Sched<[WriteCLMul]>;
+
+  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
+            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
+            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+            [(set RC:$dst,
+              (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
+            Sched<[WriteCLMulLd, ReadAfterLd]>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, int_x86_pclmulqdq>,
+                  VEX_4V, VEX_WIG;
+
+let Predicates = [NoVLX, HasVPCLMULQDQ] in
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, int_x86_pclmulqdq_256>,
+                   VEX_4V, VEX_L, VEX_WIG;
+
+multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
+                                   X86MemOperand MemOp, string Hi, string Lo> {
+  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
+                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
+                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+}
+
+multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
+                              X86MemOperand MemOp> {
+  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
+  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
+  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
+  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
+}
+
+// AVX aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
 
 // Immediate transform to help with commuting.
 def PCLMULCommuteImm : SDNodeXForm<imm, [{
   uint8_t Imm = N->getZExtValue();
   return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
 }]>;
 
-multiclass pclmul_alias<string asm, int immop> {
-  def : InstAlias<!strconcat("pclmul", asm, "dq\t{$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
-
-  def : InstAlias<!strconcat("pclmul", asm, "dq\t{$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
-
-  def : InstAlias<!strconcat("vpclmul", asm, "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), 0>;
-
-  def : InstAlias<!strconcat("vpclmul", asm, "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), 0>;
-}
-defm : pclmul_alias<"hqhq", 0x11>;
-defm : pclmul_alias<"hqlq", 0x01>;
-defm : pclmul_alias<"lqhq", 0x10>;
-defm : pclmul_alias<"lqlq", 0x00>;
-
 //===----------------------------------------------------------------------===//
 // SSE4A Instructions
 //===----------------------------------------------------------------------===//
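The !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq")) expression in the aliases derives the immediate directly from the mnemonic: in pclmul<HI><LO>dq, HI names the half of the first source (immediate bit 0) and LO the half of the second (bit 4). A hypothetical C++ rendering of the same mapping:

```cpp
#include <cstdint>
#include <string>

// Mirrors !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq")) from the TableGen above.
static uint8_t aliasImm(const std::string &HI, const std::string &LO) {
  return static_cast<uint8_t>(((LO == "hq") ? 0x10 : 0x00) |
                              ((HI == "hq") ? 0x01 : 0x00));
}
// aliasImm("hq","hq") == 0x11   pclmulhqhqdq
// aliasImm("hq","lq") == 0x01   pclmulhqlqdq
// aliasImm("lq","hq") == 0x10   pclmullqhqdq
// aliasImm("lq","lq") == 0x00   pclmullqlqdq
```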
Index: llvm/trunk/lib/Target/X86/X86Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.h
+++ llvm/trunk/lib/Target/X86/X86Subtarget.h
@@ -126,6 +126,7 @@
   /// Target has carry-less multiplication
   bool HasPCLMUL;
+  bool HasVPCLMULQDQ;
 
   /// Target has 3-operand fused multiply-add
   bool HasFMA;
@@ -465,6 +466,7 @@
   bool hasXSAVEC() const { return HasXSAVEC; }
   bool hasXSAVES() const { return HasXSAVES; }
   bool hasPCLMUL() const { return HasPCLMUL; }
+  bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
   // Prefer FMA4 to FMA - its better for commutation/memory folding and
   // has equal or better performance on all supported targets.
   bool hasFMA() const { return HasFMA && !HasFMA4; }
Index: llvm/trunk/lib/Target/X86/X86Subtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86Subtarget.cpp
+++ llvm/trunk/lib/Target/X86/X86Subtarget.cpp
@@ -299,6 +299,7 @@
   HasXSAVEC = false;
   HasXSAVES = false;
   HasPCLMUL = false;
+  HasVPCLMULQDQ = false;
   HasFMA = false;
   HasFMA4 = false;
   HasXOP = false;
Index: llvm/trunk/test/CodeGen/X86/avx-vpclmulqdq.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vpclmulqdq.ll
+++ llvm/trunk/test/CodeGen/X86/avx-vpclmulqdq.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx,vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX_VPCLMULQDQ
+
+; Check for vpclmulqdq
+define <4 x i64> @test_x86_pclmulqdq(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX_VPCLMULQDQ:       # BB#0:
+; AVX_VPCLMULQDQ-NEXT:    vpclmulqdq $17, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x44,0xc1,0x11]
+; AVX_VPCLMULQDQ-NEXT:    retl # encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
Index: llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX512_VPCLMULQDQ
+
+define <8 x i64> @test_x86_pclmulqdq(<8 x i64> %a0, <8 x i64> %a1) {
+; AVX512_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX512_VPCLMULQDQ:       # BB#0:
+; AVX512_VPCLMULQDQ-NEXT:    vpclmulqdq $1, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x44,0xc1,0x01]
+; AVX512_VPCLMULQDQ-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 1)
+  ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
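On the user side, the new 256- and 512-bit intrinsics correspond to Intel's _mm256_clmulepi64_epi128 and _mm512_clmulepi64_epi128. A usage sketch, assuming a frontend that lowers these builtins (not part of this patch):

```cpp
#include <immintrin.h>

// Multiply the low quadword of each 128-bit lane of A by the high quadword
// of the matching lane of B (imm8 = 0x10), e.g. as a building block for
// running GHASH/CRC reductions over several streams in parallel.
// Compile with VPCLMULQDQ enabled (e.g. -mvpclmulqdq with AVX).
__m256i clmul_lo_hi(__m256i A, __m256i B) {
  return _mm256_clmulepi64_epi128(A, B, 0x10);
}
```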
Index: llvm/trunk/test/CodeGen/X86/avx512vl-vpclmulqdq.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vpclmulqdq.ll
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vpclmulqdq.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL_VPCLMULQDQ
+
+define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+; AVX512VL_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX512VL_VPCLMULQDQ:       # BB#0:
+; AVX512VL_VPCLMULQDQ-NEXT:    vpclmulqdq $1, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x01]
+; AVX512VL_VPCLMULQDQ-NEXT:    retq # encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @test_x86_pclmulqdq_256(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX512VL_VPCLMULQDQ-LABEL: test_x86_pclmulqdq_256:
+; AVX512VL_VPCLMULQDQ:       # BB#0:
+; AVX512VL_VPCLMULQDQ-NEXT:    vpclmulqdq $16, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x44,0xc1,0x10]
+; AVX512VL_VPCLMULQDQ-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
Index: llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx.ll
+++ llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+vpclmulqdq | FileCheck %s
+; FIXME: actual vpclmulqdq operation should be eliminated
+
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <4 x i64> @commute_v1(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $0, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 0)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 0)
+  %3 = xor <4 x i64> %1, %2
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_v2(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $16, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 1)
+  %3 = xor <4 x i64> %2, %1
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_v3(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $17, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 17)
+  %3 = xor <4 x i64> %2, %1
+  ret <4 x i64> %3
+}
+
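The FIXME in these commute tests states the identity the tests are built on: pclmulqdq(a, b, Imm) equals pclmulqdq(b, a, Imm) with bits 0 and 4 exchanged, so each xor pair should eventually fold away entirely. A self-contained scalar check of that identity (illustrative only; low 64 result bits):

```cpp
#include <cassert>
#include <cstdint>

// Scalar stand-in for one pclmulqdq lane: imm bit 0 selects the half of
// the first source, bit 4 the half of the second; low 64 result bits only.
static uint64_t pclmulLo(const uint64_t A[2], const uint64_t B[2],
                         uint8_t Imm) {
  uint64_t X = A[Imm & 1], Y = B[(Imm >> 4) & 1], Lo = 0;
  for (int I = 0; I < 64; ++I)
    if ((Y >> I) & 1)
      Lo ^= X << I;
  return Lo;
}

int main() {
  const uint64_t A[2] = {0x0123456789abcdefULL, 0xfedcba9876543210ULL};
  const uint64_t B[2] = {0x0f0f0f0f0f0f0f0fULL, 0xf0f0f0f0f0f0f0f0ULL};
  assert(pclmulLo(A, B, 0x10) == pclmulLo(B, A, 0x01)); // swapped imm bits
  assert(pclmulLo(A, B, 0x11) == pclmulLo(B, A, 0x11)); // fixed point
  assert(pclmulLo(A, B, 0x00) == pclmulLo(B, A, 0x00)); // fixed point
  return 0;
}
```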
Index: llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+vpclmulqdq,+avx512vl | FileCheck %s
+; FIXME: actual vpclmulqdq operation should be eliminated
+
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+declare <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <2 x i64> @commute_xmm_v1(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 0)
+  %3 = xor <2 x i64> %1, %2
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_xmm_v2(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $16, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 16)
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 1)
+  %3 = xor <2 x i64> %2, %1
+  ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_xmm_v3(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $17, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 17)
+  %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 17)
+  %3 = xor <2 x i64> %2, %1
+  ret <2 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v1(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $0, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 0)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 0)
+  %3 = xor <4 x i64> %1, %2
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v2(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $16, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 1)
+  %3 = xor <4 x i64> %2, %1
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v3(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $17, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+  %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 17)
+  %3 = xor <4 x i64> %2, %1
+  ret <4 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v1(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 0)
+  %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 0)
+  %3 = xor <8 x i64> %1, %2
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v2(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $16, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 16)
+  %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 1)
+  %3 = xor <8 x i64> %2, %1
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v3(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpclmulqdq $17, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 17)
+  %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 17)
+  %3 = xor <8 x i64> %2, %1
+  ret <8 x i64> %3
+}
+
Index: llvm/trunk/test/MC/X86/avx512vlvpclmul.s
===================================================================
--- llvm/trunk/test/MC/X86/avx512vlvpclmul.s
+++ llvm/trunk/test/MC/X86/avx512vlvpclmul.s
@@ -0,0 +1,58 @@
+//RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=skx -mattr=+vpclmulqdq --show-encoding < %s | FileCheck %s
+
+// CHECK: vpclmulqdq $1, %xmm3, %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x00,0x44,0xcb,0x01]
+          vpclmulqdq $1, %xmm3, %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, (%rcx), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x00,0x44,0x09,0x01]
+          vpclmulqdq $1, (%rcx), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, -64(%rsp), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x00,0x44,0x4c,0x24,0xfc,0x01]
+          vpclmulqdq $1, -64(%rsp), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, 64(%rsp), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x00,0x44,0x4c,0x24,0x04,0x01]
+          vpclmulqdq $1, 64(%rsp), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, 268435456(%rcx,%r14,8), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x00,0x44,0x8c,0xf1,0x00,0x00,0x00,0x10,0x01]
+          vpclmulqdq $1, 268435456(%rcx,%r14,8), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, -536870912(%rcx,%r14,8), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x00,0x44,0x8c,0xf1,0x00,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870912(%rcx,%r14,8), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, -536870910(%rcx,%r14,8), %xmm22, %xmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x00,0x44,0x8c,0xf1,0x02,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870910(%rcx,%r14,8), %xmm22, %xmm1
+
+// CHECK: vpclmulqdq $1, %ymm3, %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x20,0x44,0xcb,0x01]
+          vpclmulqdq $1, %ymm3, %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, (%rcx), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x20,0x44,0x09,0x01]
+          vpclmulqdq $1, (%rcx), %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, -128(%rsp), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x20,0x44,0x4c,0x24,0xfc,0x01]
+          vpclmulqdq $1, -128(%rsp), %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, 128(%rsp), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x20,0x44,0x4c,0x24,0x04,0x01]
+          vpclmulqdq $1, 128(%rsp), %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, 268435456(%rcx,%r14,8), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x20,0x44,0x8c,0xf1,0x00,0x00,0x00,0x10,0x01]
+          vpclmulqdq $1, 268435456(%rcx,%r14,8), %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, -536870912(%rcx,%r14,8), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x20,0x44,0x8c,0xf1,0x00,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870912(%rcx,%r14,8), %ymm22, %ymm1
+
+// CHECK: vpclmulqdq $1, -536870910(%rcx,%r14,8), %ymm22, %ymm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x20,0x44,0x8c,0xf1,0x02,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870910(%rcx,%r14,8), %ymm22, %ymm1
+
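Note the displacement bytes in these EVEX encodings: with EVEX_CD8<64, CD8VF>, 8-bit displacements are scaled by the full vector width, which is why -64(%rsp), -128(%rsp), and -256(%rsp) all encode as disp8 0xfc (-4) for the xmm and ymm forms above and the zmm forms below. A sketch of the disp8*N compression rule (an assumed helper, not LLVM's actual encoder API):

```cpp
#include <cstdint>

// EVEX "disp8*N": an 8-bit displacement is implicitly multiplied by the
// tuple size N (here the vector width in bytes: 16, 32, or 64), letting
// e.g. -256 on a zmm memory operand fit in one byte as -4 (0xfc).
static bool compressDisp8(int32_t Disp, int32_t N, int8_t &Disp8) {
  if (Disp % N != 0)
    return false; // not a multiple of N: must fall back to disp32
  int32_t Scaled = Disp / N;
  if (Scaled < -128 || Scaled > 127)
    return false; // out of disp8 range even after scaling
  Disp8 = static_cast<int8_t>(Scaled);
  return true;
}
```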
Index: llvm/trunk/test/MC/X86/avx512vpclmul.s
===================================================================
--- llvm/trunk/test/MC/X86/avx512vpclmul.s
+++ llvm/trunk/test/MC/X86/avx512vpclmul.s
@@ -0,0 +1,29 @@
+//RUN: llvm-mc -triple x86_64-unknown-unknown -mcpu=knl -mattr=+vpclmulqdq --show-encoding < %s | FileCheck %s
+
+// CHECK: vpclmulqdq $1, %zmm3, %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x40,0x44,0xcb,0x01]
+          vpclmulqdq $1, %zmm3, %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, (%rcx), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x40,0x44,0x09,0x01]
+          vpclmulqdq $1, (%rcx), %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, -256(%rsp), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x40,0x44,0x4c,0x24,0xfc,0x01]
+          vpclmulqdq $1, -256(%rsp), %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, 256(%rsp), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xf3,0x4d,0x40,0x44,0x4c,0x24,0x04,0x01]
+          vpclmulqdq $1, 256(%rsp), %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, 268435456(%rcx,%r14,8), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x40,0x44,0x8c,0xf1,0x00,0x00,0x00,0x10,0x01]
+          vpclmulqdq $1, 268435456(%rcx,%r14,8), %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, -536870912(%rcx,%r14,8), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x40,0x44,0x8c,0xf1,0x00,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870912(%rcx,%r14,8), %zmm22, %zmm1
+
+// CHECK: vpclmulqdq $1, -536870910(%rcx,%r14,8), %zmm22, %zmm1
+// CHECK: encoding: [0x62,0xb3,0x4d,0x40,0x44,0x8c,0xf1,0x02,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870910(%rcx,%r14,8), %zmm22, %zmm1
Index: llvm/trunk/test/MC/X86/vpclmulqdq.s
===================================================================
--- llvm/trunk/test/MC/X86/vpclmulqdq.s
+++ llvm/trunk/test/MC/X86/vpclmulqdq.s
@@ -0,0 +1,30 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+vpclmulqdq --show-encoding %s | FileCheck %s
+
+// CHECK: vpclmulqdq $17, %ymm3, %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xe3,0x6d,0x44,0xcb,0x11]
+          vpclmulqdq $17, %ymm3, %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, (%rcx), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xe3,0x6d,0x44,0x09,0x01]
+          vpclmulqdq $1, (%rcx), %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, -4(%rsp), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xe3,0x6d,0x44,0x4c,0x24,0xfc,0x01]
+          vpclmulqdq $1, -4(%rsp), %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, 4(%rsp), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xe3,0x6d,0x44,0x4c,0x24,0x04,0x01]
+          vpclmulqdq $1, 4(%rsp), %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, 268435456(%rcx,%r14,8), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xa3,0x6d,0x44,0x8c,0xf1,0x00,0x00,0x00,0x10,0x01]
+          vpclmulqdq $1, 268435456(%rcx,%r14,8), %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, -536870912(%rcx,%r14,8), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xa3,0x6d,0x44,0x8c,0xf1,0x00,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870912(%rcx,%r14,8), %ymm2, %ymm1
+
+// CHECK: vpclmulqdq $1, -536870910(%rcx,%r14,8), %ymm2, %ymm1
+// CHECK: encoding: [0xc4,0xa3,0x6d,0x44,0x8c,0xf1,0x02,0x00,0x00,0xe0,0x01]
+          vpclmulqdq $1, -536870910(%rcx,%r14,8), %ymm2, %ymm1