Index: llvm/trunk/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsARM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsARM.td
@@ -405,36 +405,36 @@
 // De-interleaving vector loads from N-element structures.
 // Source operands are the address and alignment.
 def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                    LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                    LLVMMatchType<0>, LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;

 // Vector load N-element structure to one lane.
 // Source operands are: the address, the N input vectors (since only one
 // lane is assigned), the lane number, and the alignment.
 def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                                      [llvm_ptr_ty, LLVMMatchType<0>,
+                                      [llvm_anyptr_ty, LLVMMatchType<0>,
                                        LLVMMatchType<0>, llvm_i32_ty,
                                        llvm_i32_ty], [IntrReadArgMem]>;
 def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                        LLVMMatchType<0>],
-                                      [llvm_ptr_ty, LLVMMatchType<0>,
+                                      [llvm_anyptr_ty, LLVMMatchType<0>,
                                        LLVMMatchType<0>, LLVMMatchType<0>,
                                        llvm_i32_ty, llvm_i32_ty],
                                       [IntrReadArgMem]>;
 def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                        LLVMMatchType<0>, LLVMMatchType<0>],
-                                      [llvm_ptr_ty, LLVMMatchType<0>,
+                                      [llvm_anyptr_ty, LLVMMatchType<0>,
                                        LLVMMatchType<0>, LLVMMatchType<0>,
                                        LLVMMatchType<0>, llvm_i32_ty,
                                        llvm_i32_ty], [IntrReadArgMem]>;
@@ -442,38 +442,38 @@
 // Interleaving vector stores from N-element structures.
 // Source operands are: the address, the N vectors, and the alignment.
 def int_arm_neon_vst1 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
                                    llvm_i32_ty], [IntrReadWriteArgMem]>;
 def int_arm_neon_vst2 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, llvm_i32_ty],
                                   [IntrReadWriteArgMem]>;
 def int_arm_neon_vst3 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, LLVMMatchType<0>,
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, LLVMMatchType<1>,
                                    llvm_i32_ty], [IntrReadWriteArgMem]>;
 def int_arm_neon_vst4 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, LLVMMatchType<0>,
-                                   LLVMMatchType<0>, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, LLVMMatchType<1>,
+                                   LLVMMatchType<1>, llvm_i32_ty],
                                   [IntrReadWriteArgMem]>;

 // Vector store N-element structure from one lane.
 // Source operands are: the address, the N vectors, the lane number, and
 // the alignment.
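Note on the overload mangling this change implies (a sketch, not part of the patch): with llvm_anyptr_ty the pointer operand now participates in the intrinsic's name mangling alongside the vector type. For the loads the overloaded return vector is mangled first; for the stores the pointer suffix comes first because it is the first overloaded operand. Assuming the default address space 0, plus a hypothetical addrspace(1) variant to show why the suffix is needed:

    ; Loads: return-vector suffix, then pointer suffix.
    declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32)
    declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p1i8(i8 addrspace(1)*, i32)

    ; Stores: the pointer is overload slot 0, so its suffix comes first.
    declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32)

This is also why the vstN operand lists switch from LLVMMatchType<0> to LLVMMatchType<1>: the anyptr parameter takes overload index 0, pushing the anyvector parameter to index 1.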
 def int_arm_neon_vst2lane : Intrinsic<[],
-                                      [llvm_ptr_ty, llvm_anyvector_ty,
-                                       LLVMMatchType<0>, llvm_i32_ty,
+                                      [llvm_anyptr_ty, llvm_anyvector_ty,
+                                       LLVMMatchType<1>, llvm_i32_ty,
                                        llvm_i32_ty], [IntrReadWriteArgMem]>;
 def int_arm_neon_vst3lane : Intrinsic<[],
-                                      [llvm_ptr_ty, llvm_anyvector_ty,
-                                       LLVMMatchType<0>, LLVMMatchType<0>,
+                                      [llvm_anyptr_ty, llvm_anyvector_ty,
+                                       LLVMMatchType<1>, LLVMMatchType<1>,
                                        llvm_i32_ty, llvm_i32_ty],
                                       [IntrReadWriteArgMem]>;
 def int_arm_neon_vst4lane : Intrinsic<[],
-                                      [llvm_ptr_ty, llvm_anyvector_ty,
-                                       LLVMMatchType<0>, LLVMMatchType<0>,
-                                       LLVMMatchType<0>, llvm_i32_ty,
+                                      [llvm_anyptr_ty, llvm_anyvector_ty,
+                                       LLVMMatchType<1>, LLVMMatchType<1>,
+                                       LLVMMatchType<1>, llvm_i32_ty,
                                        llvm_i32_ty], [IntrReadWriteArgMem]>;

 // Vector bitwise select.
Index: llvm/trunk/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp
@@ -27,6 +27,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Regex.h"
 #include <cstring>

 using namespace llvm;
@@ -92,8 +93,41 @@
                                         F->arg_begin()->getType());
       return true;
     }
+    Regex vldRegex("^arm\\.neon\\.vld([1234]|[234]lane)\\.v[a-z0-9]*$");
+    if (vldRegex.match(Name)) {
+      auto fArgs = F->getFunctionType()->params();
+      SmallVector<Type *, 4> Tys(fArgs.begin(), fArgs.end());
+      // Can't use Intrinsic::getDeclaration here as the return types might
+      // then only be structurally equal.
+      FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false);
+      NewFn = Function::Create(fType, F->getLinkage(),
+                               "llvm." + Name + ".p0i8", F->getParent());
+      return true;
+    }
+    Regex vstRegex("^arm\\.neon\\.vst([1234]|[234]lane)\\.v[a-z0-9]*$");
+    if (vstRegex.match(Name)) {
+      static Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1,
+                                          Intrinsic::arm_neon_vst2,
+                                          Intrinsic::arm_neon_vst3,
+                                          Intrinsic::arm_neon_vst4};
+
+      static Intrinsic::ID StoreLaneInts[] = {Intrinsic::arm_neon_vst2lane,
+                                              Intrinsic::arm_neon_vst3lane,
+                                              Intrinsic::arm_neon_vst4lane};
+
+      auto fArgs = F->getFunctionType()->params();
+      Type *Tys[] = {fArgs[0], fArgs[1]};
+      if (Name.find("lane") == StringRef::npos)
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          StoreInts[fArgs.size() - 3], Tys);
+      else
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          StoreLaneInts[fArgs.size() - 5], Tys);
+      return true;
+    }
     break;
   }
+
   case 'c': {
     if (Name.startswith("ctlz.") && F->arg_size() == 1) {
       F->setName(Name + ".old");
@@ -651,6 +685,27 @@
   default:
     llvm_unreachable("Unknown function for CallInst upgrade.");

+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane:
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
+                                 CI->arg_operands().end());
+    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
+    CI->eraseFromParent();
+    return;
+  }
+
   case Intrinsic::ctlz:
   case Intrinsic::cttz:
     assert(CI->getNumArgOperands() == 1 &&
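The net effect of the two regex paths above: an old unsuffixed vldN declaration keeps its argument and return types and is simply renamed with a .p0i8 suffix (which is why a plain Function::Create suffices), while a vstN declaration is re-pointed at a freshly mangled Intrinsic::getDeclaration; in both cases the CallInst upgrade rewrites each call to target NewFn. A hedged before/after sketch in IR, assuming the auto-upgrader runs over old bitcode:

    ; Old bitcode (matched by vldRegex):
    declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
      %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16)

    ; After upgrade, both the declaration and the call carry the pointer suffix:
    declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32)
      %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16)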
Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -11802,9 +11802,6 @@
                                             Intrinsic::arm_neon_vld3,
                                             Intrinsic::arm_neon_vld4};

-  Function *VldnFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy);
-
   IRBuilder<> Builder(LI);
   SmallVector<Value *, 2> Ops;

@@ -11812,6 +11809,9 @@
   Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
   Ops.push_back(Builder.getInt32(LI->getAlignment()));

+  Type *Tys[] = { VecTy, Int8Ptr };
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
   CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");

   // Replace uses of each shufflevector with the corresponding vector loaded
@@ -11903,14 +11903,15 @@
   static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                        Intrinsic::arm_neon_vst3,
                                        Intrinsic::arm_neon_vst4};
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], SubVecTy);
-
   SmallVector<Value *, 6> Ops;

   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
   Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));

+  Type *Tys[] = { Int8Ptr, SubVecTy };
+  Function *VstNFunc = Intrinsic::getDeclaration(
+      SI->getModule(), StoreInts[Factor - 2], Tys);
+
   // Split the shufflevector operands into sub vectors for the new vstN call.
   for (unsigned i = 0; i < Factor; i++)
     Ops.push_back(Builder.CreateShuffleVector(
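With a second overloaded type, getDeclaration must now be given both VecTy and the i8* type; mangling over VecTy alone would produce the old, no-longer-existing name. For reference, a sketch (not taken from this patch) of the factor-2 pattern lowerInterleavedLoad rewrites and the call it emits after this change, assuming a de-interleave of <8 x i32>:

    ; Input pattern: one wide load, de-interleaved by two shufflevectors.
    %wide = load <8 x i32>, <8 x i32>* %ptr, align 4
    %even = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd  = shufflevector <8 x i32> %wide, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>

    ; Emitted replacement: the name is mangled over both VecTy and i8*.
    %p    = bitcast <8 x i32>* %ptr to i8*
    %vldN = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* %p, i32 4)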
Index: llvm/trunk/test/Analysis/BasicAA/cs-cs.ll
===================================================================
--- llvm/trunk/test/Analysis/BasicAA/cs-cs.ll
+++ llvm/trunk/test/Analysis/BasicAA/cs-cs.ll
@@ -2,8 +2,8 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "arm-apple-ios"

-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind

 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
@@ -13,27 +13,27 @@
 define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
 entry:
   %q = getelementptr i8, i8* %p, i64 16
-  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
-  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
   %c = add <8 x i16> %a, %b
   ret <8 x i16> %c

 ; CHECK-LABEL: Function: test1:

 ; CHECK: NoAlias: i8* %p, i8* %q
-; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4
-; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4
+; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #4 <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
 }

 define void @test2(i8* %P, i8* %Q) nounwind ssp {
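These ModRef conclusions follow from the attributes the intrinsic definitions expand to (IntrReadArgMem / IntrReadWriteArgMem), which the intrinsics.ll tests below check explicitly. As a reminder of the shape involved (sketch, attribute groups as printed by the tests):

    declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) #0
    declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) #1

    attributes #0 = { nounwind readonly argmemonly }
    attributes #1 = { nounwind argmemonly }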
Index: llvm/trunk/test/Analysis/BasicAA/intrinsics.ll
===================================================================
--- llvm/trunk/test/Analysis/BasicAA/intrinsics.ll
+++ llvm/trunk/test/Analysis/BasicAA/intrinsics.ll
@@ -7,14 +7,14 @@
 ; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
 define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) {
 entry:
-  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
-  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
   %c = add <8 x i16> %a, %b
   ret <8 x i16> %c
 }
@@ -22,21 +22,21 @@
 ; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[ATTR]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]]
+; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
 define <8 x i16> @test1(i8* %p, <8 x i16> %y) {
 entry:
   %q = getelementptr i8, i8* %p, i64 16
-  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
-  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
-  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind
   %c = add <8 x i16> %a, %b
   ret <8 x i16> %c
 }

-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind

 ; CHECK: attributes #0 = { nounwind readonly argmemonly }
 ; CHECK: attributes #1 = { nounwind argmemonly }
Index: llvm/trunk/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
===================================================================
--- llvm/trunk/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
+++ llvm/trunk/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll
@@ -7,20 +7,20 @@
 ; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) [[NUW:#[0-9]+]]
-; CHECK-NEXT: call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16)
+; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]]
+; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16)
 ; CHECK-NEXT: %c = add <8 x i16> %a, %a
 define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) {
 entry:
-  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
-  call void @llvm.arm.neon.vst1.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
-  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %p, i32 16) nounwind, !tbaa !2
+  %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
+  call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1
+  %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2
   %c = add <8 x i16> %a, %b
   ret <8 x i16> %c
 }

-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind
 ; CHECK: attributes #0 = { nounwind readonly argmemonly }
 ; CHECK: attributes #1 = { nounwind argmemonly }
Index: llvm/trunk/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
+++ llvm/trunk/test/CodeGen/ARM/2010-05-20-NEONSpillCrash.ll
@@ -1,36 +1,36 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon -O0 -optimize-regalloc -regalloc=basic %s -o /dev/null

 ; This test would crash the rewriter when trying to handle a spill after one of
-; the @llvm.arm.neon.vld3.v8i8 defined three parts of a register.
+; the @llvm.arm.neon.vld3.v8i8.p0i8 defined three parts of a register.

 %struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }

-declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8*, i32) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst3.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind

 define <8 x i8> @t3(i8* %A1, i8* %A2, i8* %A3, i8* %A4, i8* %A5, i8* %A6, i8* %A7, i8* %A8, i8* %B) nounwind {
-  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1b = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A2, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 0 ; <<8 x i8>> [#uses=1]
   %tmp4b = extractvalue %struct.__neon_int8x8x3_t %tmp1b, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1d = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A4, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 0 ; <<8 x i8>> [#uses=1]
   %tmp4d = extractvalue %struct.__neon_int8x8x3_t %tmp1d, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1e = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A5, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2e = extractvalue %struct.__neon_int8x8x3_t %tmp1e, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
+  %tmp1f = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A6, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=1]
   %tmp2f = extractvalue %struct.__neon_int8x8x3_t %tmp1f, 0 ; <<8 x i8>> [#uses=1]
-  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1g = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A7, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 0 ; <<8 x i8>> [#uses=1]
   %tmp4g = extractvalue %struct.__neon_int8x8x3_t %tmp1g, 1 ; <<8 x i8>> [#uses=1]
-  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
+  %tmp1h = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3.v8i8.p0i8(i8* %A8, i32 1) ; <%struct.__neon_int8x8x3_t> [#uses=2]
   %tmp2h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 0 ; <<8 x i8>> [#uses=1]
   %tmp3h = extractvalue %struct.__neon_int8x8x3_t %tmp1h, 2 ; <<8 x i8>> [#uses=1]
   %tmp2bd = add <8 x i8> %tmp2b, %tmp2d ; <<8 x i8>> [#uses=1]
   %tmp4bd = add <8 x i8> %tmp4b, %tmp4d ; <<8 x i8>> [#uses=1]
   %tmp2abcd = mul <8 x i8> undef, %tmp2bd ; <<8 x i8>> [#uses=1]
   %tmp4abcd = mul <8 x i8> undef, %tmp4bd ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A1, <8 x i8> %tmp4abcd, <8 x i8> zeroinitializer, <8 x i8> %tmp2abcd, i32 1)
   %tmp2ef = sub <8 x i8> %tmp2e, %tmp2f ; <<8 x i8>> [#uses=1]
   %tmp2gh = sub <8 x i8> %tmp2g, %tmp2h ; <<8 x i8>> [#uses=1]
   %tmp3gh = sub <8 x i8> zeroinitializer, %tmp3h ; <<8 x i8>> [#uses=1]
@@ -38,8 +38,8 @@
   %tmp2efgh = mul <8 x i8> %tmp2ef, %tmp2gh ; <<8 x i8>> [#uses=1]
   %tmp3efgh = mul <8 x i8> undef, %tmp3gh ; <<8 x i8>> [#uses=1]
   %tmp4efgh = mul <8 x i8> %tmp4ef, undef ; <<8 x i8>> [#uses=2]
-  call void @llvm.arm.neon.vst3.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
+  call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %A2, <8 x i8> %tmp4efgh, <8 x i8> %tmp3efgh, <8 x i8> %tmp2efgh, i32 1)
   %tmp4 = sub <8 x i8> %tmp4efgh, %tmp4abcd ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst3.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
+  tail call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %B, <8 x i8> zeroinitializer, <8 x i8> undef, <8 x i8> undef, i32 1)
   ret <8 x i8> %tmp4
 }
Index: llvm/trunk/test/CodeGen/ARM/2010-05-21-BuildVector.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2010-05-21-BuildVector.ll
+++ llvm/trunk/test/CodeGen/ARM/2010-05-21-BuildVector.ll
@@ -36,8 +36,8 @@
   %tmp5 = insertelement <4 x float> %tmp7, float %18, i32 3
   %19 = fmul <4 x float> %tmp5, %2
   %20 = bitcast float* %fltp to i8*
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* %20, <4 x float> %19, i32 1)
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %20, <4 x float> %19, i32 1)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
Index: llvm/trunk/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
+++ llvm/trunk/test/CodeGen/ARM/2010-06-11-vmovdrr-bitcast.ll
@@ -12,8 +12,8 @@
   %tmp9 = trunc i128 %tmp8 to i64 ; <i64> [#uses=1]
   %tmp16.i = bitcast i64 %tmp6 to <8 x i8> ; <<8 x i8>> [#uses=1]
   %tmp20.i = bitcast i64 %tmp9 to <8 x i8> ; <<8 x i8>> [#uses=1]
-  tail call void @llvm.arm.neon.vst2.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
+  tail call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %b, <8 x i8> %tmp16.i, <8 x i8> %tmp20.i, i32 1) nounwind
   ret void
 }

-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind
Index: llvm/trunk/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
+++ llvm/trunk/test/CodeGen/ARM/2010-06-29-PartialRedefFastAlloc.ll
@@ -16,10 +16,10 @@
 define i32 @test(i8* %arg) nounwind {
 entry:
-  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %arg, i32 1)
+  %0 = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* %arg, i32 1)
   %1 = shufflevector <2 x i64> undef, <2 x i64> %0, <2 x i32> <i32 1, i32 2>
   store <2 x i64> %1, <2 x i64>* undef, align 16
   ret i32 undef
 }
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32) nounwind readonly
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8*, i32) nounwind readonly
Index: llvm/trunk/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
+++ llvm/trunk/test/CodeGen/ARM/2011-08-12-vmovqqqq-pseudo.ll
@@ -4,9 +4,9 @@
 define void @test_vmovqqqq_pseudo() nounwind ssp {
 entry:
-  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
+  %vld3_lane = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> zeroinitializer, i32 7, i32 2)
   store { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, { <8 x i16>, <8 x i16>, <8 x i16> }* undef
   ret void
 }

-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
Index: llvm/trunk/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
+++ llvm/trunk/test/CodeGen/ARM/2012-01-24-RegSequenceLiveRange.ll
@@ -52,8 +52,8 @@
   %shuffle.i35.i.i = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
   %shuffle.i34.i.i = shufflevector <1 x i64> %shuffle.i36.i.i, <1 x i64> %shuffle.i35.i.i, <2 x i32> <i32 0, i32 1>
   %2 = bitcast <2 x i64> %shuffle.i34.i.i to <4 x float>
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
-  tail call void @llvm.arm.neon.vst1.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %0, i32 4) nounwind
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* undef, <4 x float> %2, i32 4) nounwind
   unreachable

 for.end: ; preds = %entry
@@ -63,10 +63,10 @@
 ; Check that pseudo-expansion preserves flags.
 define void @foo3(i8* %p) nounwind ssp {
 entry:
-  tail call void @llvm.arm.neon.vst2.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
+  tail call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* %p, <4 x float> undef, <4 x float> undef, i32 4)
   ret void
 }

 declare arm_aapcs_vfpcc void @bar(i8*, float, float, float)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind
+declare void @llvm.arm.neon.vst2.p0i8.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
Index: llvm/trunk/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
+++ llvm/trunk/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -7,8 +7,8 @@
   %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
   %0 = bitcast i32* %p to i8*
-  tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+  tail call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
   ret void
 }

-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v2i32(i8*, <2 x i32>, i32) nounwind
Index: llvm/trunk/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
+++ llvm/trunk/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -5,9 +5,9 @@
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
 target triple = "thumbv7-apple-ios5.1.0"

-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8*, i32) nounwind readonly

-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+declare void @llvm.arm.neon.vst1.p0i8.v16i8(i8*, <16 x i8>, i32) nounwind

 define void @findEdges(i8*) nounwind ssp {
   %2 = icmp sgt i32 undef, 0
@@ -19,16 +19,16 @@
 ;