Index: include/llvm/Target/TargetRegisterInfo.h
===================================================================
--- include/llvm/Target/TargetRegisterInfo.h
+++ include/llvm/Target/TargetRegisterInfo.h
@@ -504,6 +504,25 @@
   getMatchingSuperRegClass(const TargetRegisterClass *A,
                            const TargetRegisterClass *B,
                            unsigned Idx) const;
 
+  /// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+  /// share the same register file.
+  bool shareSameRegisterFile(const TargetRegisterClass *DefRC,
+                             unsigned DefSubReg,
+                             const TargetRegisterClass *SrcRC,
+                             unsigned SrcSubReg) const;
+
+  /// For a copy-like instruction that defines a register of class DefRC with
+  /// subreg index DefSubReg, reading from another source with class SrcRC and
+  /// subregister SrcSubReg, return true if this is a preferable copy
+  /// instruction or an earlier use should be used.
+  virtual bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                                    unsigned DefSubReg,
+                                    const TargetRegisterClass *SrcRC,
+                                    unsigned SrcSubReg) const {
+    // If this source does not incur a cross register bank copy, use it.
+    return shareSameRegisterFile(DefRC, DefSubReg, SrcRC, SrcSubReg);
+  }
+
   /// getSubClassWithSubReg - Returns the largest legal sub-class of RC that
   /// supports the sub-register index Idx.
   /// If no such sub-class exists, return NULL.
Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -584,36 +584,6 @@
   return TII->optimizeCondBranch(MI);
 }
 
-/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
-/// share the same register file.
-static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
-                                  const TargetRegisterClass *DefRC,
-                                  unsigned DefSubReg,
-                                  const TargetRegisterClass *SrcRC,
-                                  unsigned SrcSubReg) {
-  // Same register class.
-  if (DefRC == SrcRC)
-    return true;
-
-  // Both operands are sub registers. Check if they share a register class.
-  unsigned SrcIdx, DefIdx;
-  if (SrcSubReg && DefSubReg)
-    return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
-                                      SrcIdx, DefIdx) != nullptr;
-  // At most one of the register is a sub register, make it Src to avoid
-  // duplicating the test.
-  if (!SrcSubReg) {
-    std::swap(DefSubReg, SrcSubReg);
-    std::swap(DefRC, SrcRC);
-  }
-
-  // One of the register is a sub register, check if we can get a superclass.
-  if (SrcSubReg)
-    return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
-  // Plain copy.
-  return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
-}
-
 /// \brief Try to find the next source that share the same register file
 /// for the value defined by \p Reg and \p SubReg.
 /// When true is returned, the \p RewriteMap can be used by the client to
@@ -694,10 +664,8 @@
       return false;
 
     const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
-
-    // If this source does not incur a cross register bank copy, use it.
-    ShouldRewrite = shareSameRegisterFile(*TRI, DefRC, SubReg, SrcRC,
-                                          CurSrcPair.SubReg);
+    ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
+                                              CurSrcPair.SubReg);
   } while (!ShouldRewrite);
 
   // Continue looking for new sources...
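Targets that do not override the new hook keep the existing behavior: the default `shouldRewriteCopySrc` forwards to `shareSameRegisterFile`, the same predicate PeepholeOptimizer used before this change. A target that wants the peephole to keep looking through subregister extracts can override it; a minimal sketch, assuming a hypothetical `MyTargetRegisterInfo` subclass (this mirrors the AMDGPU override later in the patch):

```cpp
// Hypothetical override: never stop on a subregister extract; only accept a
// source when the def and src classes share a common subclass, i.e. a plain
// copy within one register file.
bool MyTargetRegisterInfo::shouldRewriteCopySrc(
    const TargetRegisterClass *DefRC, unsigned DefSubReg,
    const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const {
  return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
```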
Index: lib/CodeGen/TargetRegisterInfo.cpp
===================================================================
--- lib/CodeGen/TargetRegisterInfo.cpp
+++ lib/CodeGen/TargetRegisterInfo.cpp
@@ -273,6 +273,36 @@
   return BestRC;
 }
 
+bool TargetRegisterInfo::shareSameRegisterFile(const TargetRegisterClass *DefRC,
+                                               unsigned DefSubReg,
+                                               const TargetRegisterClass *SrcRC,
+                                               unsigned SrcSubReg) const {
+  // Same register class.
+  if (DefRC == SrcRC)
+    return true;
+
+  // Both operands are sub registers. Check if they share a register class.
+  unsigned SrcIdx, DefIdx;
+  if (SrcSubReg && DefSubReg) {
+    return getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+                                  SrcIdx, DefIdx) != nullptr;
+  }
+
+  // At most one of the registers is a subregister; make it Src to avoid
+  // duplicating the test.
+  if (!SrcSubReg) {
+    std::swap(DefSubReg, SrcSubReg);
+    std::swap(DefRC, SrcRC);
+  }
+
+  // One of the registers is a subregister; check if we can get a superclass.
+  if (SrcSubReg)
+    return getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 // Compute target-independent register allocator hints to help eliminate copies.
 void
 TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -79,6 +79,11 @@
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;
 
+  bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+                            unsigned DefSubReg,
+                            const TargetRegisterClass *SrcRC,
+                            unsigned SrcSubReg) const override;
+
   /// \p Channel This is the register channel (e.g. a value from 0-16), not the
   ///            SubReg index.
   /// \returns The sub-register of Reg that is in Channel.
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -401,6 +401,30 @@
   }
 }
 
+bool SIRegisterInfo::shouldRewriteCopySrc(
+  const TargetRegisterClass *DefRC,
+  unsigned DefSubReg,
+  const TargetRegisterClass *SrcRC,
+  unsigned SrcSubReg) const {
+  // We want to prefer the smallest register class possible, so we don't want to
+  // stop and rewrite on anything that looks like a subregister
+  // extract. Operations mostly don't care about the super register class, so we
+  // only want to stop on the most basic of copies between the same register
+  // class.
+  //
+  // e.g. if we have something like
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
+  // vreg3 = COPY vreg2, sub0
+  //
+  // We want to look through the COPY to find:
+  //  => vreg3 = COPY vreg0
+
+  // Plain copy.
+  return getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
 unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                           const TargetRegisterClass *SubRC,
                                           unsigned Channel) const {
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -35,14 +35,11 @@
   ret void
 }
 
-; FIXME: Shuffling to new superregister
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
@@ -64,11 +61,15 @@
   ret void
 }
 
+
+; FIXME: the v_lshr_b64 x, x, 32 is a bad way of doing a copy
+
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
 define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -140,13 +141,21 @@
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-NOT: v_mov_b32
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dword
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -310,13 +310,12 @@
   ret void
 }
 
-; FIXME: Shouldn't do 4th conversion
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
 ; GCN: buffer_load_dwordx4
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-NOT: v_cvt_f16_f32_e32
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
@@ -395,38 +394,38 @@
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
 ; GCN: s_endpgm
 define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
Index: test/CodeGen/AMDGPU/llvm.round.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,12 +21,9 @@
 ; SI-DAG: v_cmp_eq_i32
 
 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_cmp_gt_i32_e32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
 
-; SI-DAG: v_cmp_gt_i32_e64
-
-
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; Check that when a mubuf addr64 instruction is handled in moveToVALU
+; from the pointer, dead register writes are not emitted.
+
+; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
+
+; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
+; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
+
+; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
+; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
+; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
+
+define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+bb:
+  %tmp = icmp sgt i32 %arg3, 0
+  br i1 %tmp, label %bb4, label %bb17
+
+bb4:
+  %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
+  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
+  %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
+  br label %bb17
+
+bb17:
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/CodeGen/ARM/vcombine.ll
===================================================================
--- test/CodeGen/ARM/vcombine.ll
+++ test/CodeGen/ARM/vcombine.ll
@@ -2,11 +2,15 @@
 ; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
 
 define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-; CHECK: vcombine8
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine8
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32>
@@ -14,11 +18,15 @@
 define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-; CHECK: vcombine16
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine16
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
@@ -26,9 +34,14 @@
 define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-; CHECK: vcombine32
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombine32
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
   %tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -38,9 +51,14 @@
 define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vcombinefloat
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombinefloat
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
   %tmp1 = load <2 x float>, <2 x float>* %A
@@ -50,11 +68,15 @@
 define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
-; CHECK: vcombine64
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine64
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
+; CHECK-BE: vmov r1, r0, [[LD0]]
+; CHECK-BE: vmov r3, r2, [[LD1]]
   %tmp1 = load <1 x i64>, <1 x i64>* %A
   %tmp2 = load <1 x i64>, <1 x i64>* %B
   %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32>
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vtrni16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -84,11 +84,11 @@
 define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-LABEL: vtrni32_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <2 x i32>, <2 x i32>* %A
   %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -116,11 +116,11 @@
 define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
 ; CHECK-LABEL: vtrnf_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.32 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <2 x float>, <2 x float>* %A
   %tmp2 = load <2 x float>, <2 x float>* %B
@@ -281,11 +281,11 @@
 define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vtrn.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vuzpi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@
 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vuzp.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vzipi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.16 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@
 define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr [[LDR1:d[0-9]+]], [r1]
+; CHECK-NEXT: vldr [[LDR0:d[0-9]+]], [r0]
+; CHECK-NEXT: vzip.8 [[LDR0]], [[LDR1]]
+; CHECK-NEXT: vmov r0, r1, [[LDR0]]
+; CHECK-NEXT: vmov r2, r3, [[LDR1]]
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
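For reference, the moved predicate is also callable directly through TargetRegisterInfo by other clients. A minimal sketch of such a query for a copy like `%dst:GPR32 = COPY %src.sub_lo:GPR64`; the `MyTarget` namespace, the GPR32/GPR64 classes, and the `sub_lo` index are hypothetical names, not part of this patch:

```cpp
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"

// Ask whether rewriting the copy keeps the value in one register file.
static bool copyStaysInRegisterFile(const llvm::MachineFunction &MF) {
  const llvm::TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
  // Only the source side uses a subregister index here, so the
  // implementation reduces to
  // getMatchingSuperRegClass(GPR64, GPR32, sub_lo) != nullptr.
  return TRI->shareSameRegisterFile(&MyTarget::GPR32RegClass, /*DefSubReg=*/0,
                                    &MyTarget::GPR64RegClass,
                                    /*SrcSubReg=*/MyTarget::sub_lo);
}
```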