Index: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4214,6 +4214,7 @@
   case ARM::VLD3d32Pseudo:
   case ARM::VLD1d64TPseudo:
   case ARM::VLD1d64TPseudoWB_fixed:
+  case ARM::VLD1d64TPseudoWB_register:
   case ARM::VLD3d8Pseudo_UPD:
   case ARM::VLD3d16Pseudo_UPD:
   case ARM::VLD3d32Pseudo_UPD:
@@ -4231,6 +4232,7 @@
   case ARM::VLD4d32Pseudo:
   case ARM::VLD1d64QPseudo:
   case ARM::VLD1d64QPseudoWB_fixed:
+  case ARM::VLD1d64QPseudoWB_register:
   case ARM::VLD4d8Pseudo_UPD:
   case ARM::VLD4d16Pseudo_UPD:
   case ARM::VLD4d32Pseudo_UPD:
Index: llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ llvm/trunk/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -156,8 +156,10 @@

 { ARM::VLD1d64QPseudo,      ARM::VLD1d64Q,     true, false, false, SingleSpc,  4, 1 ,false},
 { ARM::VLD1d64QPseudoWB_fixed,  ARM::VLD1d64Qwb_fixed,   true, true, false, SingleSpc,  4, 1 ,false},
+{ ARM::VLD1d64QPseudoWB_register,  ARM::VLD1d64Qwb_register,   true, true, true, SingleSpc,  4, 1 ,false},
 { ARM::VLD1d64TPseudo,      ARM::VLD1d64T,     true, false, false, SingleSpc,  3, 1 ,false},
 { ARM::VLD1d64TPseudoWB_fixed,  ARM::VLD1d64Twb_fixed,   true, true, false, SingleSpc,  3, 1 ,false},
+{ ARM::VLD1d64TPseudoWB_register,  ARM::VLD1d64Twb_register,   true, true, true, SingleSpc,  3, 1 ,false},

 { ARM::VLD2LNd16Pseudo,     ARM::VLD2LNd16,     true, false, false, SingleSpc,  2, 4 ,true},
 { ARM::VLD2LNd16Pseudo_UPD, ARM::VLD2LNd16_UPD, true, true, true, SingleSpc,  2, 4 ,true},
@@ -1503,6 +1505,7 @@
   case ARM::VLD3d32Pseudo:
   case ARM::VLD1d64TPseudo:
   case ARM::VLD1d64TPseudoWB_fixed:
+  case ARM::VLD1d64TPseudoWB_register:
   case ARM::VLD3d8Pseudo_UPD:
   case ARM::VLD3d16Pseudo_UPD:
   case ARM::VLD3d32Pseudo_UPD:
@@ -1520,6 +1523,7 @@
   case ARM::VLD4d32Pseudo:
   case ARM::VLD1d64QPseudo:
   case ARM::VLD1d64QPseudoWB_fixed:
+  case ARM::VLD1d64QPseudoWB_register:
   case ARM::VLD4d8Pseudo_UPD:
   case ARM::VLD4d16Pseudo_UPD:
   case ARM::VLD4d32Pseudo_UPD:
Index: llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1794,15 +1794,17 @@
   Ops.push_back(Align);
   if (isUpdating) {
     SDValue Inc = N->getOperand(AddrOpIdx + 1);
-    // FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
-    // case entirely when the rest are updated to that form, too.
     bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-    if ((NumVecs <= 2) && !IsImmUpdate)
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    // FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
-    // check for that explicitly too. Horribly hacky, but temporary.
-    if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
-      Ops.push_back(IsImmUpdate ? Reg0 : Inc);
+    if (!IsImmUpdate) {
+      // We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
+      // check for the opcode rather than the number of vector elements.
+      if (isVLDfixed(Opc))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
+      Ops.push_back(Inc);
+    // VLD1/VLD2 fixed increment does not need Reg0 so only include it in
+    // the operands if not such an opcode.
+    } else if (!isVLDfixed(Opc))
+      Ops.push_back(Reg0);
   }
   Ops.push_back(Pred);
   Ops.push_back(Reg0);
@@ -1948,16 +1950,17 @@
   Ops.push_back(Align);
   if (isUpdating) {
     SDValue Inc = N->getOperand(AddrOpIdx + 1);
-    // FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
-    // case entirely when the rest are updated to that form, too.
     bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
-    if (NumVecs <= 2 && !IsImmUpdate)
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    // FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
-    // check for that explicitly too. Horribly hacky, but temporary.
-    if (!IsImmUpdate)
+    if (!IsImmUpdate) {
+      // We use a VST1 for v1i64 even if the pseudo says VST2/3/4, so
+      // check for the opcode rather than the number of vector elements.
+      if (isVSTfixed(Opc))
+        Opc = getVLDSTRegisterUpdateOpcode(Opc);
       Ops.push_back(Inc);
-    else if (NumVecs > 2 && !isVSTfixed(Opc))
+    }
+    // VST1/VST2 fixed increment does not need Reg0 so only include it in
+    // the operands if not such an opcode.
+    else if (!isVSTfixed(Opc))
       Ops.push_back(Reg0);
   }
   Ops.push_back(SrcReg);
Index: llvm/trunk/test/CodeGen/ARM/vld3.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vld3.ll
+++ llvm/trunk/test/CodeGen/ARM/vld3.ll
@@ -96,6 +96,19 @@
     ret <1 x i64> %tmp4
 }

+define <1 x i64> @vld3i64_reg_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld3i64_reg_update:
+;CHECK: vld1.64 {d16, d17, d18}, [{{r[0-9]+|lr}}:64], {{r[0-9]+|lr}}
+    %tmp0 = bitcast i64* %A to i8*
+    %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64.p0i8(i8* %tmp0, i32 16)
+    %tmp5 = getelementptr i64, i64* %A, i32 1
+    store i64* %tmp5, i64** %ptr
+    %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0
+    %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2
+    %tmp4 = add <1 x i64> %tmp2, %tmp3
+    ret <1 x i64> %tmp4
+}
+
 define <16 x i8> @vld3Qi8(i8* %A) nounwind {
 ;CHECK-LABEL: vld3Qi8:
 ;Check the alignment value. Max for this instruction is 64 bits:
Index: llvm/trunk/test/CodeGen/ARM/vld4.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vld4.ll
+++ llvm/trunk/test/CodeGen/ARM/vld4.ll
@@ -96,6 +96,19 @@
     ret <1 x i64> %tmp4
 }

+define <1 x i64> @vld4i64_reg_update(i64** %ptr, i64* %A) nounwind {
+;CHECK-LABEL: vld4i64_reg_update:
+;CHECK: vld1.64 {d16, d17, d18, d19}, [{{r[0-9]+|lr}}:256], {{r[0-9]+|lr}}
+    %tmp0 = bitcast i64* %A to i8*
+    %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64.p0i8(i8* %tmp0, i32 64)
+    %tmp5 = getelementptr i64, i64* %A, i32 1
+    store i64* %tmp5, i64** %ptr
+    %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0
+    %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2
+    %tmp4 = add <1 x i64> %tmp2, %tmp3
+    ret <1 x i64> %tmp4
+}
+
 define <16 x i8> @vld4Qi8(i8* %A) nounwind {
 ;CHECK-LABEL: vld4Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
Index: llvm/trunk/test/CodeGen/ARM/vst3.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vst3.ll
+++ llvm/trunk/test/CodeGen/ARM/vst3.ll
@@ -73,6 +73,18 @@
     ret void
 }

+define void @vst3i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst3i64_reg_update
+;CHECK: vst1.64 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}], r{{.*}}
+    %A = load i64*, i64** %ptr
+    %tmp0 = bitcast i64* %A to i8*
+    %tmp1 = load <1 x i64>, <1 x i64>* %B
+    call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+    %tmp2 = getelementptr i64, i64* %A, i32 1
+    store i64* %tmp2, i64** %ptr
+    ret void
+}
+
 define void @vst3Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: vst3Qi8:
 ;Check the alignment value. Max for this instruction is 64 bits:
Index: llvm/trunk/test/CodeGen/ARM/vst4.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/vst4.ll
+++ llvm/trunk/test/CodeGen/ARM/vst4.ll
@@ -72,6 +72,18 @@
     ret void
 }

+define void @vst4i64_reg_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK-LABEL: vst4i64_reg_update:
+;CHECK: vst1.64 {d16, d17, d18, d19}, [r{{[0-9]+}}], r{{[0-9]+}}
+    %A = load i64*, i64** %ptr
+    %tmp0 = bitcast i64* %A to i8*
+    %tmp1 = load <1 x i64>, <1 x i64>* %B
+    call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 1)
+    %tmp2 = getelementptr i64, i64* %A, i32 1
+    store i64* %tmp2, i64** %ptr
+    ret void
+}
+
 define void @vst4Qi8(i8* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: vst4Qi8:
 ;Check the alignment value. Max for this instruction is 256 bits:
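
Context for readers of this patch: the new `_register` writeback pseudos are selected when a vld3/vld4 (or vst3/vst4) of v1i64 post-increments its base pointer by something other than the full transfer size, so the update cannot use the fixed `!` form and must write back a register, exactly the situation the new tests construct in IR. Below is a minimal C sketch of source code that could reach this path; it is hypothetical and not part of the patch (the function name load3_and_step and the exact lowering are assumptions, though vld3_s64 is the standard NEON intrinsic and on 32-bit ARM it is normally lowered via llvm.arm.neon.vld3.v1i64):

    #include <arm_neon.h>

    /* Hypothetical illustration (not from the patch). A vld3 of 1x64-bit
     * vectors transfers 24 bytes (three d registers), but here the base
     * pointer advances by only one i64 (8 bytes). Because 8 != 24, this is
     * not a "perfect" increment, so the load needs the register-writeback
     * form, i.e. the VLD1d64TPseudoWB_register pseudo this patch teaches
     * the backend to expand. */
    int64x1x3_t load3_and_step(const int64_t **pp) {
      const int64_t *p = *pp;
      int64x1x3_t v = vld3_s64(p); /* expected: llvm.arm.neon.vld3.v1i64 */
      *pp = p + 1;                 /* 8-byte step, not the 24-byte transfer size */
      return v;
    }

This mirrors what the new vld3i64_reg_update test checks: an llvm.arm.neon.vld3.v1i64 call whose pointer update does not match the transfer size now selects VLD1d64TPseudoWB_register and expands to a vld1.64 of three d registers with a register post-increment.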