Index: include/llvm/IR/IntrinsicsARM.td
===================================================================
--- include/llvm/IR/IntrinsicsARM.td
+++ include/llvm/IR/IntrinsicsARM.td
@@ -408,15 +408,15 @@
                                   [llvm_ptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                    LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
                                    LLVMMatchType<0>, LLVMMatchType<0>],
-                                  [llvm_ptr_ty, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_i32_ty],
                                   [IntrReadArgMem]>;
 
 // Vector load N-element structure to one lane.
@@ -445,17 +445,17 @@
                                 [llvm_ptr_ty, llvm_anyvector_ty,
                                  llvm_i32_ty], [IntrReadWriteArgMem]>;
 def int_arm_neon_vst2 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, llvm_i32_ty],
                                   [IntrReadWriteArgMem]>;
 def int_arm_neon_vst3 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, LLVMMatchType<0>,
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, LLVMMatchType<1>,
                                    llvm_i32_ty],
                                   [IntrReadWriteArgMem]>;
 def int_arm_neon_vst4 : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_anyvector_ty,
-                                   LLVMMatchType<0>, LLVMMatchType<0>,
-                                   LLVMMatchType<0>, llvm_i32_ty],
+                                  [llvm_anyptr_ty, llvm_anyvector_ty,
+                                   LLVMMatchType<1>, LLVMMatchType<1>,
+                                   LLVMMatchType<1>, llvm_i32_ty],
                                   [IntrReadWriteArgMem]>;
 
 // Vector store N-element structure from one lane.
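Note on the TableGen change above: switching the pointer parameter from llvm_ptr_ty to llvm_anyptr_ty makes the vldN/vstN intrinsics overloaded on the pointer type as well as the vector type (the stores now use LLVMMatchType<1> because the pointer becomes overloaded type 0), so the mangled intrinsic names gain a pointer-type suffix. A minimal illustrative sketch, not part of the patch, assuming the usual llvm/IR headers and using namespace llvm, with M a Module and C its LLVMContext:

  // Request a vld2 declaration for a <4 x i32> result loaded through an
  // i8 pointer in address space 1; both types participate in the mangling.
  Type *VecTy = VectorType::get(Type::getInt32Ty(C), 4);   // <4 x i32>
  Type *PtrTy = Type::getInt8PtrTy(C, /*AddrSpace=*/1);    // i8 addrspace(1)*
  Type *Tys[] = {VecTy, PtrTy};
  Function *Vld2 = Intrinsic::getDeclaration(&M, Intrinsic::arm_neon_vld2, Tys);
  // Vld2->getName() is "llvm.arm.neon.vld2.v4i32.p1i8" rather than the old
  // "llvm.arm.neon.vld2.v4i32".
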
+ Name + ".p0i8", F->getParent()); + return true; + } + Regex vstRegex("^arm\\.neon\\.vst[234]\\.v[a-z0-9]*$"); + if (vstRegex.match(Name)) { + static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; + + auto fArgs = F->getFunctionType()->params(); + Type *Tys[] = {fArgs[0], fArgs[1]}; + NewFn = Intrinsic::getDeclaration(F->getParent(), + StoreInts[fArgs.size() - 4], Tys); + return true; + } break; } + case 'c': { if (Name.startswith("ctlz.") && F->arg_size() == 1) { F->setName(Name + ".old"); @@ -637,6 +662,19 @@ default: llvm_unreachable("Unknown function for CallInst upgrade."); + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: { + SmallVector Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args)); + CI->eraseFromParent(); + return; + } + case Intrinsic::ctlz: case Intrinsic::cttz: assert(CI->getNumArgOperands() == 1 && Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -11689,9 +11689,6 @@ Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); - IRBuilder<> Builder(LI); SmallVector Ops; @@ -11699,6 +11696,9 @@ Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded @@ -11790,14 +11790,15 @@ static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst4}; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], SubVecTy); - SmallVector Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + // Split the shufflevector operands into sub vectors for the new vstN call. 
Index: test/CodeGen/ARM/arm-interleaved-accesses.ll
===================================================================
--- test/CodeGen/ARM/arm-interleaved-accesses.ll
+++ test/CodeGen/ARM/arm-interleaved-accesses.ll
@@ -202,3 +202,24 @@
   store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
   ret void
 }
+
+; The following test cases check that address spaces are properly handled.
+
+; CHECK-LABEL: load_address_space
+; CHECK: vld3.32
+define void @load_address_space(<4 x i32> addrspace(1)* %A, <2 x i32>* %B) {
+  %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %A
+  %interleaved = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 3>
+  store <2 x i32> %interleaved, <2 x i32>* %B
+  ret void
+}
+
+; CHECK-LABEL: store_address_space
+; CHECK: vst2.32
+define void @store_address_space(<2 x i32>* %A, <2 x i32>* %B, <4 x i32> addrspace(1)* %C) {
+  %tmp0 = load <2 x i32>, <2 x i32>* %A
+  %tmp1 = load <2 x i32>, <2 x i32>* %B
+  %interleaved = shufflevector <2 x i32> %tmp0, <2 x i32> %tmp1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %interleaved, <4 x i32> addrspace(1)* %C
+  ret void
+}
Index: test/Transforms/InstCombine/neon-intrinsics.ll
===================================================================
--- test/Transforms/InstCombine/neon-intrinsics.ll
+++ test/Transforms/InstCombine/neon-intrinsics.ll
@@ -3,8 +3,8 @@
 ; The alignment arguments for NEON load/store intrinsics can be increased
 ; by instcombine. Check for this.
 
-; CHECK: vld4.v2i32({{.*}}, i32 32)
-; CHECK: vst4.v2i32({{.*}}, i32 16)
+; CHECK: vld4.v2i32.p0i8({{.*}}, i32 32)
+; CHECK: vst4.p0i8.v2i32({{.*}}, i32 16)
 
 @x = common global [8 x i32] zeroinitializer, align 32
 @y = common global [8 x i32] zeroinitializer, align 16
@@ -12,14 +12,14 @@
 %struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
 
 define void @test() nounwind ssp {
-  %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* bitcast ([8 x i32]* @x to i8*), i32 1)
+  %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8* bitcast ([8 x i32]* @x to i8*), i32 1)
   %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0
   %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 1
   %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2
   %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 3
-  call void @llvm.arm.neon.vst4.v2i32(i8* bitcast ([8 x i32]* @y to i8*), <2 x i32> %tmp2, <2 x i32> %tmp3, <2 x i32> %tmp4, <2 x i32> %tmp5, i32 1)
+  call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* bitcast ([8 x i32]* @y to i8*), <2 x i32> %tmp2, <2 x i32> %tmp3, <2 x i32> %tmp4, <2 x i32> %tmp5, i32 1)
   ret void
 }
 
-declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8*, i32) nounwind readonly
-declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
+declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst4.p0i8.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
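One detail worth calling out in the updated CHECK lines: loads and stores mangle their overloads in different orders, following the TableGen definitions. For vldN the overloaded result vector comes first and the pointer second (vld4.v2i32.p0i8); for vstN the pointer is the first overloaded parameter, so it leads (vst4.p0i8.v2i32). An illustrative sketch, not part of the patch, under the same assumptions as the earlier example (usual llvm/IR headers, using namespace llvm, M a Module and C its LLVMContext):

  // The vstN pointer overload precedes the vector overload in the name.
  Type *VecTy = VectorType::get(Type::getInt32Ty(C), 2);  // <2 x i32>
  Type *Tys[] = {Type::getInt8PtrTy(C), VecTy};           // pointer overload first
  Function *Vst4 = Intrinsic::getDeclaration(&M, Intrinsic::arm_neon_vst4, Tys);
  // Vst4->getName() is "llvm.arm.neon.vst4.p0i8.v2i32", matching the new
  // CHECK line; the vld4 counterpart mangles as "llvm.arm.neon.vld4.v2i32.p0i8".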