Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -1239,6 +1239,45 @@
   // Currently we haven't seen motivating example for that and we
   // want to avoid untested code.
   NumRewrittenCopies += Changed;
+
+  // Fold simple extracts from REG_SEQUENCE to use the original source inserted
+  // into the super register.
+  //
+  // e.g.
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
+  // vreg3 = COPY vreg2:sub0
+  //
+  // => vreg3 = COPY vreg0
+  //
+  // TODO: Handle more complex cases with sub-subregisters:
+  //
+  // vreg0 = ...
+  // vreg1 = ...
+  // vreg2 = ...
+  // vreg3 = ...
+  // vreg4 = REG_SEQUENCE vreg0, sub0, vreg1, sub1
+  // vreg5 = REG_SEQUENCE vreg4, sub0_sub1, vreg2, sub2, vreg3, sub3
+  // vreg6 = COPY vreg5:sub0
+  // => vreg6 = COPY vreg0
+  MachineOperand &Src = MI->getOperand(1);
+  if (Src.getSubReg()) {
+    MachineInstr *SrcDef = MRI->getUniqueVRegDef(Src.getReg());
+    if (SrcDef->getOpcode() == TargetOpcode::REG_SEQUENCE) {
+      for (unsigned I = 1, E = SrcDef->getNumOperands(); I != E; I += 2) {
+        if (Src.getSubReg() == SrcDef->getOperand(I + 1).getImm()) {
+          MachineOperand &Op = SrcDef->getOperand(I);
+          Src.setReg(Op.getReg());
+          Src.setSubReg(Op.getSubReg());
+          MRI->clearKillFlags(Op.getReg());
+          Changed = true;
+          break;
+        }
+      }
+    }
+  }
+
   return Changed;
 }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1856,15 +1856,10 @@
   }
 
   MachineBasicBlock &MBB = *MI->getParent();
 
-  // Extract the ptr from the resource descriptor.
-
-  // SRsrcPtrLo = srsrc:sub0
-  unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc,
-    &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass);
-  // SRsrcPtrHi = srsrc:sub1
-  unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc,
-    &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass);
+  // Extract the ptr from the resource descriptor.
+  unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
+    &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
 
   // Create an empty resource descriptor
   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -1889,36 +1884,39 @@
     .addImm(RsrcDataFormat >> 32);
 
   // NewSRsrc = {Zero64, SRsrcFormat}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewSRsrc)
-    .addReg(Zero64)
-    .addImm(AMDGPU::sub0_sub1)
-    .addReg(SRsrcFormatLo)
-    .addImm(AMDGPU::sub2)
-    .addReg(SRsrcFormatHi)
-    .addImm(AMDGPU::sub3);
+  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
+    .addReg(Zero64)
+    .addImm(AMDGPU::sub0_sub1)
+    .addReg(SRsrcFormatLo)
+    .addImm(AMDGPU::sub2)
+    .addReg(SRsrcFormatHi)
+    .addImm(AMDGPU::sub3);
 
   MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
   unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
-  unsigned NewVAddrLo;
-  unsigned NewVAddrHi;
   if (VAddr) {
     // This is already an ADDR64 instruction so we need to add the pointer
     // extracted from the resource descriptor to the current value of VAddr.
-    NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 
-    // NewVaddrLo = SRsrcPtrLo + VAddr:sub0
+    // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
     DebugLoc DL = MI->getDebugLoc();
     BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
-      .addReg(SRsrcPtrLo)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
       .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
 
-    // NewVaddrHi = SRsrcPtrHi + VAddr:sub1
+    // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
     BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
-      .addReg(SRsrcPtrHi)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
      .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
+
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(NewVAddrLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(NewVAddrHi)
+      .addImm(AMDGPU::sub1);
   } else {
     // This instructions is the _OFFSET variant, so we need to convert it to
     // ADDR64.
@@ -1945,21 +1943,17 @@
     MI->removeFromParent();
     MI = Addr64;
 
-    NewVAddrLo = SRsrcPtrLo;
-    NewVAddrHi = SRsrcPtrHi;
+    // NewVaddr = {NewVaddrHi, NewVaddrLo}
+    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub0)
+      .addImm(AMDGPU::sub0)
+      .addReg(SRsrcPtr, 0, AMDGPU::sub1)
+      .addImm(AMDGPU::sub1);
+
     VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
     SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
   }
 
-  // NewVaddr = {NewVaddrHi, NewVaddrLo}
-  BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
-          NewVAddr)
-    .addReg(NewVAddrLo)
-    .addImm(AMDGPU::sub0)
-    .addReg(NewVAddrHi)
-    .addImm(AMDGPU::sub1);
-
-  // Update the instruction to use NewVaddr
   VAddr->setReg(NewVAddr);
 
   // Update the instruction to use NewSRsrc
Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -35,14 +35,11 @@
   ret void
 }
 
-; FIXME: Shuffling to new superregister
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4:
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
-; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]]
-; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]]
-; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]]
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
+; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Z:[0-9]+]]:[[REG_W:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[REG_Y]]
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
@@ -64,11 +61,15 @@
   ret void
 }
 
+
+; FIXME: the v_lshl_b64 x, x, 32 is a bad way of doing a copy
+
 ; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}}
+; CI: v_lshr_b64 v{{\[}}[[Y_COPY:[0-9]+]]:{{[0-9]+\]}}, v{{\[}}[[REG_X]]:[[REG_Y]]{{\]}}, 32
 ; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]]
-; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
+; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[Y_COPY]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
 define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
@@ -140,13 +141,21 @@
 ; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
+; CI-NOT: v_mov_b32
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
+; CI-NOT: v_mov_b32
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dword
Index: test/CodeGen/AMDGPU/half.ll
===================================================================
--- test/CodeGen/AMDGPU/half.ll
+++ test/CodeGen/AMDGPU/half.ll
@@ -310,13 +310,12 @@
   ret void
 }
 
-; FIXME: Shouldn't do 4th conversion
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
 ; GCN: buffer_load_dwordx4
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
 ; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-NOT: v_cvt_f16_f32_e32
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
@@ -395,38 +394,38 @@
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_short
 ; GCN: s_endpgm
 define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
Index: test/CodeGen/AMDGPU/llvm.round.f64.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -21,12 +21,9 @@
 ; SI-DAG: v_cmp_eq_i32
 ; SI-DAG: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff
-; SI-DAG: v_cmp_gt_i32_e64
+; SI-DAG: v_cmp_gt_i32_e32
 ; SI-DAG: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]]
-; SI-DAG: v_cmp_gt_i32_e64
-
-
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
Index: test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; Check that when mubuf addr64 instruction is handled in moveToVALU
+; from the pointer, dead register writes are not emitted.
+
+; FIXME: We should be able to use the SGPR directly as src0 to v_add_i32
+
+; GCN-LABEL: {{^}}clobber_vgpr_pair_pointer_add:
+; GCN: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
+
+; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
+; GCN-NEXT: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
+
+; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
+; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
+; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
+
+define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+bb:
+  %tmp = icmp sgt i32 %arg3, 0
+  br i1 %tmp, label %bb4, label %bb17
+
+bb4:
+  %tmp14 = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %ptrarg
+  %tmp15 = getelementptr inbounds i8, i8 addrspace(1)* %tmp14, i64 %arg1
+  %tmp16 = load volatile i8, i8 addrspace(1)* %tmp15
+  br label %bb17
+
+bb17:
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/CodeGen/ARM/vcombine.ll
===================================================================
--- test/CodeGen/ARM/vcombine.ll
+++ test/CodeGen/ARM/vcombine.ll
@@ -2,11 +2,15 @@
 ; RUN: llc -mtriple=armeb-eabi -float-abi=soft -mattr=+neon %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
 
 define <16 x i8> @vcombine8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-; CHECK: vcombine8
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine8
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <16 x i32>
@@ -14,11 +18,15 @@
 }
 
 define <8 x i16> @vcombine16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-; CHECK: vcombine16
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine16
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE-DAG: vmov r0, r1, [[LD0]]
+; CHECK-LE-DAG: vmov r2, r3, [[LD1]]
+
+; CHECK-BE-DAG: vmov r1, r0, d16
+; CHECK-BE-DAG: vmov r3, r2, d17
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <8 x i32>
@@ -26,9 +34,14 @@
 define <4 x i32> @vcombine32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-; CHECK: vcombine32
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombine32
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
   %tmp1 = load <2 x i32>, <2 x i32>* %A
@@ -38,9 +51,14 @@
 define <4 x float> @vcombinefloat(<2 x float>* %A, <2 x float>* %B) nounwind {
-; CHECK: vcombinefloat
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
+; CHECK-LABEL: vcombinefloat
+
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
 ; CHECK-BE: vmov r1, r0, d16
 ; CHECK-BE: vmov r3, r2, d17
   %tmp1 = load <2 x float>, <2 x float>* %A
@@ -50,11 +68,15 @@
 define <2 x i64> @vcombine64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
-; CHECK: vcombine64
-; CHECK-LE: vmov r0, r1, d16
-; CHECK-LE: vmov r2, r3, d17
-; CHECK-BE: vmov r1, r0, d16
-; CHECK-BE: vmov r3, r2, d17
+; CHECK-LABEL: vcombine64
+; CHECK-DAG: vldr [[LD0:d[0-9]+]], [r0]
+; CHECK-DAG: vldr [[LD1:d[0-9]+]], [r1]
+
+; CHECK-LE: vmov r0, r1, [[LD0]]
+; CHECK-LE: vmov r2, r3, [[LD1]]
+
+; CHECK-BE: vmov r1, r0, [[LD0]]
+; CHECK-BE: vmov r3, r2, [[LD1]]
   %tmp1 = load <1 x i64>, <1 x i64>* %A
   %tmp2 = load <1 x i64>, <1 x i64>* %B
   %tmp3 = shufflevector <1 x i64> %tmp1, <1 x i64> %tmp2, <2 x i32>
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vtrni8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vtrni16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vtrni16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.16 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -84,11 +84,11 @@
 define <4 x i32> @vtrni32_Qres(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ; CHECK-LABEL: vtrni32_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.32 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <2 x i32>, <2 x i32>* %A
   %tmp2 = load <2 x i32>, <2 x i32>* %B
@@ -116,11 +116,11 @@
 define <4 x float> @vtrnf_Qres(<2 x float>* %A, <2 x float>* %B) nounwind {
 ; CHECK-LABEL: vtrnf_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.32 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.32 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <2 x float>, <2 x float>* %A
   %tmp2 = load <2 x float>, <2 x float>* %B
@@ -281,11 +281,11 @@
 define <16 x i8> @vtrni8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vtrni8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vtrn.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vuzpi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vuzp.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vuzpi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vuzpi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vuzp.16 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@
 define <16 x i8> @vuzpi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vuzpi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vuzp.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vuzp.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -20,11 +20,11 @@
 define <16 x i8> @vzipi8_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vzip.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
@@ -52,11 +52,11 @@
 define <8 x i16> @vzipi16_Qres(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ; CHECK-LABEL: vzipi16_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.16 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vzip.16 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
@@ -220,11 +220,11 @@
 define <16 x i8> @vzipi8_undef_Qres(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ; CHECK-LABEL: vzipi8_undef_Qres:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.8 d16, d17
-; CHECK-NEXT: vmov r0, r1, d16
-; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vzip.8 d17, d16
+; CHECK-NEXT: vmov r0, r1, d17
+; CHECK-NEXT: vmov r2, r3, d16
 ; CHECK-NEXT: mov pc, lr
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
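
For reference (illustrative note, not part of the patch to apply): the PeepholeOptimizer.cpp hunk relies on the REG_SEQUENCE operand layout, where operand 0 is the def and the remaining operands come in (source register, subregister-index immediate) pairs. A minimal standalone sketch of that lookup follows, assuming the usual llvm/CodeGen/MachineInstr.h APIs; the helper name lookThroughRegSequence is hypothetical and does not exist in the patch or in tree.

  // Hypothetical helper, sketch only: return the register that a REG_SEQUENCE
  // inserts at subregister index SubIdx, or 0 if no operand pair covers that
  // exact index. SrcSubReg receives any subregister index on the source operand.
  static unsigned lookThroughRegSequence(const MachineInstr &RegSeq,
                                         unsigned SubIdx, unsigned &SrcSubReg) {
    assert(RegSeq.getOpcode() == TargetOpcode::REG_SEQUENCE);
    // Operand 0 is the def; operands I and I+1 form (register, subreg index) pairs.
    for (unsigned I = 1, E = RegSeq.getNumOperands(); I != E; I += 2) {
      if (RegSeq.getOperand(I + 1).getImm() == SubIdx) {
        SrcSubReg = RegSeq.getOperand(I).getSubReg();
        return RegSeq.getOperand(I).getReg();
      }
    }
    return 0;
  }

This is the same walk the new PeepholeOptimizer code performs inline on the copy's source; the TODO case (a source operand that is itself defined by another REG_SEQUENCE) would repeat this lookup through each level of nesting.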