Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -222,10 +222,11 @@ const uint16_t *QOpcodes); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs - /// should be 2, 3 or 4. The opcode array specifies the instructions used - /// for loading D registers. (Q registers are not supported.) + /// should be 1, 2, 3 or 4. DOpcodes specifies the instructions used for + /// loading D registers; QOpcodes, when supplied, those for Q registers. void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes = nullptr); /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2, /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be @@ -1761,6 +1762,12 @@ case ARM::VLD1q16wb_fixed : return true; case ARM::VLD1q32wb_fixed : return true; case ARM::VLD1q64wb_fixed : return true; + case ARM::VLD1DUPd8wb_fixed : return true; + case ARM::VLD1DUPd16wb_fixed : return true; + case ARM::VLD1DUPd32wb_fixed : return true; + case ARM::VLD1DUPq8wb_fixed : return true; + case ARM::VLD1DUPq16wb_fixed : return true; + case ARM::VLD1DUPq32wb_fixed : return true; case ARM::VLD2d8wb_fixed : return true; case ARM::VLD2d16wb_fixed : return true; case ARM::VLD2d32wb_fixed : return true; @@ -1815,6 +1822,12 @@ case ARM::VLD1d64Qwb_fixed: return ARM::VLD1d64Qwb_register; case ARM::VLD1d64TPseudoWB_fixed: return ARM::VLD1d64TPseudoWB_register; case ARM::VLD1d64QPseudoWB_fixed: return ARM::VLD1d64QPseudoWB_register; + case ARM::VLD1DUPd8wb_fixed : return ARM::VLD1DUPd8wb_register; + case ARM::VLD1DUPd16wb_fixed : return ARM::VLD1DUPd16wb_register; + case ARM::VLD1DUPd32wb_fixed : return ARM::VLD1DUPd32wb_register; + case ARM::VLD1DUPq8wb_fixed : return ARM::VLD1DUPq8wb_register; + case ARM::VLD1DUPq16wb_fixed : return ARM::VLD1DUPq16wb_register; + case ARM::VLD1DUPq32wb_fixed : return ARM::VLD1DUPq32wb_register;
case ARM::VST1d8wb_fixed: return ARM::VST1d8wb_register; case ARM::VST1d16wb_fixed: return ARM::VST1d16wb_register; @@ -2255,8 +2268,9 @@ } void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs, - const uint16_t *Opcodes) { - assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); + const uint16_t *DOpcodes, + const uint16_t *QOpcodes) { + assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); SDLoc dl(N); SDValue MemAddr, Align; @@ -2284,19 +2298,21 @@ } Align = CurDAG->getTargetConstant(Alignment, dl, MVT::i32); - unsigned OpcodeIndex; + unsigned Opc; switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("unhandled vld-dup type"); - case MVT::v8i8: OpcodeIndex = 0; break; - case MVT::v4i16: OpcodeIndex = 1; break; + case MVT::v8i8: Opc = DOpcodes[0]; break; + case MVT::v16i8: Opc = QOpcodes[0]; break; + case MVT::v4i16: Opc = DOpcodes[1]; break; + case MVT::v8i16: Opc = QOpcodes[1]; break; case MVT::v2f32: - case MVT::v2i32: OpcodeIndex = 2; break; + case MVT::v2i32: Opc = DOpcodes[2]; break; + case MVT::v4f32: + case MVT::v4i32: Opc = QOpcodes[2]; break; } SDValue Pred = getAL(CurDAG, dl); SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); - SDValue SuperReg; - unsigned Opc = Opcodes[OpcodeIndex]; SmallVector<SDValue, 6> Ops; Ops.push_back(MemAddr); Ops.push_back(Align); @@ -2304,6 +2320,8 @@ // fixed-stride update instructions don't have an explicit writeback // operand. It's implicit in the opcode itself. SDValue Inc = N->getOperand(2); + if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode())) + Opc = getVLDSTRegisterUpdateOpcode(Opc); if (!isa<ConstantSDNode>(Inc.getNode())) Ops.push_back(Inc); // FIXME: VLD3 and VLD4 haven't been updated to that form yet. @@ -2322,14 +2340,18 @@ ResTys.push_back(MVT::Other); SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1); - SuperReg = SDValue(VLdDup, 0); // Extract the subregisters.
- static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering"); - unsigned SubIdx = ARM::dsub_0; - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - ReplaceUses(SDValue(N, Vec), - CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), SDValue(VLdDup, 0)); + } else { + SDValue SuperReg = SDValue(VLdDup, 0); + static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering"); + unsigned SubIdx = ARM::dsub_0; + for (unsigned Vec = 0; Vec < NumVecs; ++Vec) + ReplaceUses(SDValue(N, Vec), + CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg)); + } ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1)); if (isUpdating) ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2)); @@ -3269,6 +3291,15 @@ return; } + case ARMISD::VLD1DUP: { + static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8, ARM::VLD1DUPd16, + ARM::VLD1DUPd32 }; + static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8, ARM::VLD1DUPq16, + ARM::VLD1DUPq32 }; + SelectVLDDup(N, false, 1, DOpcodes, QOpcodes); + return; + } + case ARMISD::VLD2DUP: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16, ARM::VLD2DUPd32 }; @@ -3292,6 +3323,17 @@ return; } + case ARMISD::VLD1DUP_UPD: { + static const uint16_t DOpcodes[] = { ARM::VLD1DUPd8wb_fixed, + ARM::VLD1DUPd16wb_fixed, + ARM::VLD1DUPd32wb_fixed }; + static const uint16_t QOpcodes[] = { ARM::VLD1DUPq8wb_fixed, + ARM::VLD1DUPq16wb_fixed, + ARM::VLD1DUPq32wb_fixed }; + SelectVLDDup(N, true, 1, DOpcodes, QOpcodes); + return; + } + case ARMISD::VLD2DUP_UPD: { static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed, ARM::VLD2DUPd16wb_fixed, Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -190,7 +190,8 @@ MEMCPY, // Vector load N-element structure to all lanes: - VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD1DUP = 
ISD::FIRST_TARGET_MEMORY_OPCODE, + VLD2DUP, VLD3DUP, VLD4DUP, @@ -202,6 +203,7 @@ VLD2LN_UPD, VLD3LN_UPD, VLD4LN_UPD, + VLD1DUP_UPD, VLD2DUP_UPD, VLD3DUP_UPD, VLD4DUP_UPD, Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -1428,6 +1428,7 @@ case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; + case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -1438,6 +1439,7 @@ case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; + case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; @@ -10431,6 +10433,7 @@ isLaneOp = true; switch (N->getOpcode()) { default: llvm_unreachable("unexpected opcode for Neon base update"); + case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; @@ -10545,8 +10548,8 @@ StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); } - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, - Ops, AlignedVecTy, + EVT LoadVT = isLaneOp ? AlignedVecTy.getVectorElementType() : AlignedVecTy; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, MemN->getMemOperand()); // Update the uses. 
@@ -10691,6 +10694,34 @@ return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } +/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. +/// Folds VDUP(load) into a single ARMISD::VLD1DUP node when the loaded value +/// has one use; returns an empty SDValue when no change was made. +static SDValue PerformVDUPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDValue Op = N->getOperand(0); + + // Only fold unindexed loads (an indexed load also yields the updated base, + // which the two-result VLD1DUP node built here does not provide) whose + // memory type equals the vector element type (i.e. not an extending load). + LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()); + if (LD && Op.hasOneUse() && LD->isUnindexed() && + LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { + SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), + DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; + SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); + SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, + Ops, LD->getMemoryVT(), + LD->getMemOperand()); + // Redirect users of the load's value 1 to the new node's MVT::Other result. + DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); + return VLDDup; + } + + return SDValue(); +} + static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); @@ -11518,6 +11549,7 @@ case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); @@ -11533,6 +11565,7 @@ case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); + case ARMISD::VLD1DUP: case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: Index: test/CodeGen/ARM/vlddup.ll =================================================================== --- test/CodeGen/ARM/vlddup.ll +++ test/CodeGen/ARM/vlddup.ll @@ -10,6 +10,84 @@ ret <8 x i8> %tmp3 } +define <8 x i8> @vld1dupi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_preinc: +;CHECK: vld1.8 {d16[]}, [r1] + %0 = load i8*, i8** %a,
align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %1 = load i8, i8* %add.ptr, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <8 x i8> @vld1dupi8_postinc_fixed(i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_postinc_fixed: +;CHECK: vld1.8 {d16[]}, [r1]! + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 1 + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <8 x i8> @vld1dupi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld1dupi8_postinc_register: +;CHECK: vld1.8 {d16[]}, [r2], r1 + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <8 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* %add.ptr, i8** %a, align 4 + ret <8 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_preinc(i8** noalias nocapture %a, i32 %b) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_preinc: +;CHECK: vld1.8 {d16[], d17[]}, [r1] + %0 = load i8*, i8** %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %1 = load i8, i8* %add.ptr, align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_postinc_fixed(i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_postinc_fixed: +;CHECK: vld1.8 {d16[], d17[]}, [r1]! 
+ %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 1 + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + +define <16 x i8> @vld1dupqi8_postinc_register(i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld1dupqi8_postinc_register: +;CHECK: vld1.8 {d16[], d17[]}, [r2], r1 + %0 = load i8*, i8** %a, align 4 + %1 = load i8, i8* %0, align 1 + %2 = insertelement <16 x i8> undef, i8 %1, i32 0 + %lane = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* %add.ptr, i8** %a, align 4 + ret <16 x i8> %lane +} + define <4 x i16> @vld1dupi16(i16* %A) nounwind { ;CHECK-LABEL: vld1dupi16: ;Check the alignment value. Max for this instruction is 16 bits: @@ -75,6 +153,63 @@ ret <8 x i8> %tmp5 } +define void @vld2dupi8_preinc(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %b) nounwind { +;CHECK-LABEL: vld2dupi8_preinc: +;CHECK: vld2.8 {d16[], d17[]}, [r2] +entry: + %0 = load i8*, i8** %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %b + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %add.ptr, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + store i8* %add.ptr, i8** %a, align 4 + %r.sroa.0.0..sroa_idx8 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r.sroa.0.0..sroa_idx8, align 8 + %r.sroa.4.0..sroa_idx11 = 
getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r.sroa.4.0..sroa_idx11, align 8 + ret void +} + +define void @vld2dupi8_postinc_fixed(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a) nounwind { +entry: +;CHECK-LABEL: vld2dupi8_postinc_fixed: +;CHECK: vld2.8 {d16[], d17[]}, [r2]! + %0 = load i8*, i8** %a, align 4 + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 2 + store i8* %add.ptr, i8** %a, align 4 + %r.sroa.0.0..sroa_idx7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r.sroa.0.0..sroa_idx7, align 8 + %r.sroa.4.0..sroa_idx10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r.sroa.4.0..sroa_idx10, align 8 + ret void +} + +define void @vld2dupi8_postinc_variable(%struct.__neon_int8x8x2_t* noalias nocapture sret %agg.result, i8** noalias nocapture %a, i32 %n) nounwind { +entry: +;CHECK-LABEL: vld2dupi8_postinc_variable: +;CHECK: vld2.8 {d16[], d17[]}, [r3], r2 + %0 = load i8*, i8** %a, align 4 + %vld_dup = tail call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %0, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1) + %1 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 0 + %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + %2 = extractvalue %struct.__neon_int8x8x2_t %vld_dup, 1 + %lane1 = shufflevector <8 x 
i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer + %add.ptr = getelementptr inbounds i8, i8* %0, i32 %n + store i8* %add.ptr, i8** %a, align 4 + %r.sroa.0.0..sroa_idx7 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 0 + store <8 x i8> %lane, <8 x i8>* %r.sroa.0.0..sroa_idx7, align 8 + %r.sroa.4.0..sroa_idx10 = getelementptr inbounds %struct.__neon_int8x8x2_t, %struct.__neon_int8x8x2_t* %agg.result, i32 0, i32 1 + store <8 x i8> %lane1, <8 x i8>* %r.sroa.4.0..sroa_idx10, align 8 + ret void +} + define <4 x i16> @vld2dupi16(i8* %A) nounwind { ;CHECK-LABEL: vld2dupi16: ;Check that a power-of-two alignment smaller than the total size of the memory Index: test/CodeGen/ARM/vmul.ll =================================================================== --- test/CodeGen/ARM/vmul.ll +++ test/CodeGen/ARM/vmul.ll @@ -635,13 +635,26 @@ ret void } -define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind { -; Look for doing a normal scalar FP load rather than an to-all-lanes load. -; e.g., "ldr s0, [r2]" rathern than "vld1.32 {d18[], d19[]}, [r2:32]" -; Then check that the vector multiply has folded the splat to all lanes -; and used a vector * scalar instruction. -; CHECK: vldr {{s[0-9]+}}, [r2] +define void @fmul_splat(<4 x float> * %a, <4 x float>* nocapture %dst, float %tmp) nounwind { +; Look for a scalar float rather than a splat, then a vector*scalar multiply. 
+; CHECK: vmov s0, r2 ; CHECK: vmul.f32 q8, q8, d0[0] + %tmp5 = load <4 x float>, <4 x float>* %a, align 4 + %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0 + %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1 + %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2 + %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3 + %tmp10 = fmul <4 x float> %tmp9, %tmp5 + store <4 x float> %tmp10, <4 x float>* %dst, align 4 + ret void +} + +define void @fmul_splat_load(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind { +; Look for a normal scalar FP load rather than a to-all-lanes load, +; then a vector*scalar multiply. +; FIXME: Temporarily broken due to splat representation changes. +; CHECK: vld1.32 {d18[], d19[]}, [r2:32] +; CHECK: vmul.f32 q8, q9, q8 %tmp = load float, float* %src, align 4 %tmp5 = load <4 x float>, <4 x float>* %a, align 4 %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0