Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -5045,8 +5045,16 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
-  // If the mask is twice as long as the result then we need to check the upper
-  // and lower parts of the mask
+  // If the mask is twice as long as the input vector, reject it when M[i] and
+  // M[i + NumElts] are both non-negative and M[i] >= M[i + NumElts].
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i+NumElts])
+        return false;
+
+  // If the mask is twice as long as the input vector then we need to check the
+  // upper and lower parts of the mask
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
@@ -5074,6 +5082,12 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i+NumElts])
+        return false;
+
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -286,6 +286,19 @@
   ret <4 x i32> %0
 }
 
+define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  store <4 x i32> %0, <4 x i32>* %C
+  ret void
+}
+
+
 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
   ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
   ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -295,3 +295,13 @@
   ret <4 x i32> %0
 }
 
+define void @vzip_undef_rev_shufflemask_vtrn(<2 x i32>* %A, <4 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
+  store <4 x i32> %0, <4 x i32>* %B
+  ret void
+}