Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3089,6 +3089,7 @@ // the 'X' node here can either be nothing or an extract_vector_elt to catch // more cases. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() && N0.getOperand(0).getOpcode() == ISD::LOAD) || N0.getOpcode() == ISD::LOAD) { LoadSDNode *Load = cast( (N0.getOpcode() == ISD::LOAD) ? Index: test/CodeGen/ARM/dagcombine-anyexttozeroext.ll =================================================================== --- test/CodeGen/ARM/dagcombine-anyexttozeroext.ll +++ test/CodeGen/ARM/dagcombine-anyexttozeroext.ll @@ -19,12 +19,48 @@ ret float %7 } +; CHECK-LABEL: g: define float @g(<4 x i16>* nocapture %in) { ; CHECK: vldr %1 = load <4 x i16>, <4 x i16>* %in - ; CHECK-NOT: uxth + + ; For now we're generating a vmov.16 and a uxth instruction. + ; The uxth is redundant, and we should be able to extend without + ; having to generate cross-domain copies. Once we can do this + ; we should modify the checks below. + + ; CHECK: uxth %2 = extractelement <4 x i16> %1, i32 0 ; CHECK: vcvt.f32.u32 %3 = uitofp i16 %2 to float ret float %3 } + +; For the following code the backend generates an +; (and 0xff (i32 extract_vector_elt (zext load <4 x i8> to <4 x i16>))) +; +; The 'and' is not redundant and cannot be removed. Since +; extract_vector_elt is doing an implicit any_ext, the 'and' +; is required to guarantee that the top bits are set to zero. + +; Ideally this should be a zext from <4 x i8> to <4 x i32>. 
+ +; CHECK-LABEL: h: +; CHECK: vld1.32 +; CHECK: uxtb +define <4 x i32> @h(<4 x i8> *%in) { + %1 = load <4 x i8>, <4 x i8>* %in, align 4 + %2 = extractelement <4 x i8> %1, i32 0 + %3 = zext i8 %2 to i32 + %4 = insertelement <4 x i32> undef, i32 %3, i32 0 + %5 = extractelement <4 x i8> %1, i32 1 + %6 = zext i8 %5 to i32 + %7 = insertelement <4 x i32> %4, i32 %6, i32 1 + %8 = extractelement <4 x i8> %1, i32 2 + %9 = zext i8 %8 to i32 + %10 = insertelement <4 x i32> %7, i32 %9, i32 2 + %11 = extractelement <4 x i8> %1, i32 3 + %12 = zext i8 %11 to i32 + %13 = insertelement <4 x i32> %10, i32 %12, i32 3 + ret <4 x i32> %13 +}