Index: include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- include/llvm/IR/IntrinsicsAArch64.td
+++ include/llvm/IR/IntrinsicsAArch64.td
@@ -103,6 +103,14 @@
 def int_aarch64_neon_vsqrshrn : Neon_N2V_Narrow_Intrinsic;
 def int_aarch64_neon_vuqrshrn : Neon_N2V_Narrow_Intrinsic;

+// 128-bit load/store
+def int_aarch64_vldrq : Intrinsic<[llvm_anyvector_ty],
+                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [IntrReadArgMem]>;
+def int_aarch64_vstrq : Intrinsic<[],
+                                  [llvm_ptr_ty, llvm_anyvector_ty,
+                                   llvm_i32_ty], [IntrReadWriteArgMem]>;
+
 // Vector across
 class Neon_Across_Intrinsic
   : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
@@ -325,6 +333,9 @@
 // Signed Saturating Doubling Multiply-Subtract Long
 def int_aarch64_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;

+def int_aarch64_neon_vmull_p64 :
+    Intrinsic<[llvm_v16i8_ty], [llvm_v1i64_ty, llvm_v1i64_ty], [IntrNoMem]>;
+
 class Neon_2Arg_ShiftImm_Intrinsic
   : Intrinsic<[llvm_v1i64_ty], [llvm_v1i64_ty, llvm_i32_ty], [IntrNoMem]>;

Index: lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1559,6 +1559,58 @@
     };
     return SelectVLDSTLane(Node, false, false, 4, Opcodes);
   }
+  case Intrinsic::aarch64_vldrq: {
+    SDLoc dl(Node);
+    SmallVector<SDValue, 3> Ops;
+
+    // Push back the Memory Address.
+    Ops.push_back(Node->getOperand(2));
+    // Push back the offset 0.
+    Ops.push_back(CurDAG->getConstant(0, MVT::i32, false));
+    // Push back the Chain.
+    Ops.push_back(Node->getOperand(0));
+
+    SmallVector<EVT, 2> ResTys;
+    // Push back the type of the returned super register.
+    ResTys.push_back(Node->getValueType(0));
+    ResTys.push_back(MVT::Other); // Type of the Chain
+    SDNode *VLd = CurDAG->getMachineNode(AArch64::LSFP128_LDR,
+                                         dl, ResTys, Ops);
+
+    // Transfer memoperands.
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+    cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+    return VLd;
+  }
+  case Intrinsic::aarch64_vstrq: {
+    SDLoc dl(Node);
+    SmallVector<SDValue, 4> Ops;
+
+    unsigned Vec0Idx = 3;
+    SmallVector<SDValue, 2> Regs(Node->op_begin() + Vec0Idx,
+                                 Node->op_begin() + Vec0Idx + 1);
+    // Push back the source register.
+    Ops.push_back(createQTuple(Regs));
+    // Push back the Memory Address.
+    Ops.push_back(Node->getOperand(2));
+    // Push back the offset 0.
+    Ops.push_back(CurDAG->getConstant(0, MVT::i32, false));
+    // Push back the Chain.
+    Ops.push_back(Node->getOperand(0));
+
+    SmallVector<EVT, 1> ResTys;
+    ResTys.push_back(MVT::Other); // Type for the Chain
+    SDNode *VSt = CurDAG->getMachineNode(AArch64::LSFP128_STR,
+                                         dl, ResTys, Ops);
+
+    // Transfer memoperands.
+    MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+    MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+    cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+    return VSt;
+  }
   } // End of switch IntNo
   break;
 } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4543,6 +4543,7 @@
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
   case Intrinsic::arm_neon_vld4:
+  case Intrinsic::aarch64_vldrq:
   case Intrinsic::aarch64_neon_vld1x2:
   case Intrinsic::aarch64_neon_vld1x3:
   case Intrinsic::aarch64_neon_vld1x4:
@@ -4566,6 +4567,7 @@
   case Intrinsic::arm_neon_vst2:
   case Intrinsic::arm_neon_vst3:
   case Intrinsic::arm_neon_vst4:
+  case Intrinsic::aarch64_vstrq:
   case Intrinsic::aarch64_neon_vst1x2:
   case Intrinsic::aarch64_neon_vst1x3:
   case Intrinsic::aarch64_neon_vst1x4:
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -3021,19 +3021,19 @@
                                      int_arm_neon_vqsubs>;

 multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode, bit Commutable = 0> {
+                         SDPatternOperator opnode_8h8b,
+                         SDPatternOperator opnode_1q1d, bit Commutable = 0> {
   let isCommutable = Commutable in {
     def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                              opnode, VPR128, VPR64, v8i16, v8i8>;
+                              opnode_8h8b, VPR128, VPR64, v8i16, v8i8>;

-    def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
-                             (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-                             asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
-                             [], NoItinerary>;
+    def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d",
+                              opnode_1q1d, VPR128, VPR64, v16i8, v1i64>;
   }
 }

-defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp,
+                              int_aarch64_neon_vmull_p64, 1>;

 multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
                                    string opnode, bit Commutable = 0> {
@@ -3042,10 +3042,17 @@
                                       !cast<PatFrag>(opnode # "_16B"),
                                       v8i16, v16i8>;

-    def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
-                             (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-                             asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
-                             [], NoItinerary>;
+    def _1q2d :
+      NeonI_3VDiff<0b1, u, 0b11, opcode,
+                   (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                   asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+                   [(set (v16i8 VPR128:$Rd),
+                      (v16i8 (int_aarch64_neon_vmull_p64
+                        (v1i64 (scalar_to_vector
+                          (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))),
+                        (v1i64 (scalar_to_vector
+                          (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))],
+                   NoItinerary>;
   }
 }

Index: test/CodeGen/AArch64/128bit_load_store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/128bit_load_store.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
+
+define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
+; CHECK: test_vstrq_p128
+; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  %0 = bitcast i128* %ptr to i8*
+  %1 = bitcast i128 %val to <16 x i8>
+  tail call void @llvm.aarch64.vstrq.v16i8(i8* %0, <16 x i8> %1, i32 16)
+  ret void
+}
+
+declare void @llvm.aarch64.vstrq.v16i8(i8*, <16 x i8>, i32) #1
+
+define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
+; CHECK: test_vldrq_p128
+; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  %0 = bitcast i128* %ptr to i8*
+  %vldrq = tail call <16 x i8> @llvm.aarch64.vldrq.v16i8(i8* %0, i32 16)
+  %conv = bitcast <16 x i8> %vldrq to i128
+  ret i128 %conv
+}
+
+declare <16 x i8> @llvm.aarch64.vldrq.v16i8(i8*, i32) #3
Index: test/CodeGen/AArch64/neon-3vdiff.ll
===================================================================
--- test/CodeGen/AArch64/neon-3vdiff.ll
+++ test/CodeGen/AArch64/neon-3vdiff.ll
@@ -1804,3 +1804,30 @@
   ret <8 x i16> %vmull.i.i
 }

+define i128 @test_vmull_p64(i64 %a, i64 %b) #4 {
+; CHECK: test_vmull_p64
+; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
+entry:
+  %vmull.i = insertelement <1 x i64> undef, i64 %a, i32 0
+  %vmull1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+  %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i, <1 x i64> %vmull1.i) #1
+  %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
+  ret i128 %vmull3.i
+}
+
+define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #4 {
+; CHECK: test_vmull_high_p64
+; CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+  %0 = extractelement <2 x i64> %a, i32 1
+  %1 = extractelement <2 x i64> %b, i32 1
+  %vmull.i.i = insertelement <1 x i64> undef, i64 %0, i32 0
+  %vmull1.i.i = insertelement <1 x i64> undef, i64 %1, i32 0
+  %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64> %vmull.i.i, <1 x i64> %vmull1.i.i) #1
+  %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
+  ret i128 %vmull3.i.i
+}
+
+declare <16 x i8> @llvm.aarch64.neon.vmull.p64(<1 x i64>, <1 x i64>) #5
+
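
Usage note (not part of the patch): a minimal LLVM IR sketch showing the two new load/store intrinsics used together, assuming only the llvm.aarch64.vldrq/vstrq declarations introduced in test/CodeGen/AArch64/128bit_load_store.ll above; the @copy_p128 name is illustrative. With the patch applied, both calls should select to the LSFP128_LDR/LSFP128_STR patterns, i.e. a single ldr/str of a q register rather than going through general-purpose registers.

; Sketch only, not part of the patch; intrinsic signatures taken from the
; test file above. Last operand is the alignment, as in the tests.
define void @copy_p128(i8* %dst, i8* %src) {
entry:
  %val = tail call <16 x i8> @llvm.aarch64.vldrq.v16i8(i8* %src, i32 16)
  tail call void @llvm.aarch64.vstrq.v16i8(i8* %dst, <16 x i8> %val, i32 16)
  ret void
}

declare <16 x i8> @llvm.aarch64.vldrq.v16i8(i8*, i32)
declare void @llvm.aarch64.vstrq.v16i8(i8*, <16 x i8>, i32)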