Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -5050,8 +5050,16 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
-  // If the mask is twice as long as the result then we need to check the upper
-  // and lower parts of the mask
+  // If the mask is twice as long as the input vector then each value in the
+  // upper half of the mask must be larger than the one in the lower half.
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i + NumElts])
+        return false;
+
+  // If the mask is twice as long as the input vector then we need to check the
+  // upper and lower parts of the mask
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
@@ -5079,6 +5087,12 @@
   if (M.size() != NumElts && M.size() != NumElts*2)
     return false;
 
+  if (M.size() == NumElts * 2)
+    for (unsigned i = 0; i < NumElts; ++i)
+      if (M[i] >= 0 && M[i + NumElts] >= 0 &&
+          (unsigned) M[i] >= (unsigned) M[i + NumElts])
+        return false;
+
   for (unsigned i = 0; i < M.size(); i += NumElts) {
     WhichResult = M[i] == 0 ? 0 : 1;
     for (unsigned j = 0; j < NumElts; j += 2) {
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -371,3 +371,37 @@
   %rv = select <8 x i1> %c, <8 x i8> %tr0, <8 x i8> %tr1
   ret <8 x i8> %rv
 }
+
+define void @vtrn_with_undef_lower_twice(<4 x i32>* %A, <4 x i32>* %B, <8 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vtrn_with_undef_lower_twice
+  ; CHECK: @ BB#0:
+  ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+  ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+  ; CHECK-NEXT: vtrn.32 q9, q8
+  ; CHECK-NEXT: vst1.32 {d16, d17}, [r2:128]!
+  ; CHECK-NEXT: vst1.64 {d16, d17}, [r2:128]
+  ; CHECK-NEXT: mov pc, lr
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp2 = load <4 x i32>, <4 x i32>* %B
+  %0 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 5, i32 3, i32 7>
+  store <8 x i32> %0, <8 x i32>* %C
+  ret void
+}
+
+define void @vtrn_with_undef_upper_twice(<4 x i32>* %A, <4 x i32>* %B, <8 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vtrn_with_undef_upper_twice
+  ; CHECK: @ BB#0:
+  ; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+  ; CHECK-NEXT: vld1.64 {d18, d19}, [r0]
+  ; CHECK-NEXT: vtrn.32 q9, q8
+  ; CHECK-NEXT: vst1.32 {d18, d19}, [r2:128]!
+  ; CHECK-NEXT: vst1.64 {d18, d19}, [r2:128]
+  ; CHECK-NEXT: mov pc, lr
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp2 = load <4 x i32>, <4 x i32>* %B
+  %0 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <8 x i32> <i32 0, i32 4, i32 2, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+  store <8 x i32> %0, <8 x i32>* %C
+  ret void
+}
Index: test/CodeGen/ARM/vuzp.ll
===================================================================
--- test/CodeGen/ARM/vuzp.ll
+++ test/CodeGen/ARM/vuzp.ll
@@ -286,6 +286,19 @@
   ret <4 x i32> %0
 }
 
+define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
+entry:
+  ; CHECK-LABEL: vuzp_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vuzp
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = load <2 x i32>, <2 x i32>* %B
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  store <4 x i32> %0, <4 x i32>* %C
+  ret void
+}
+
+
 define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -295,3 +295,13 @@
   ret <4 x i32> %0
 }
 
+define void @vzip_undef_rev_shufflemask_vtrn(<2 x i32>* %A, <4 x i32>* %B) {
+entry:
+  ; CHECK-LABEL: vzip_undef_rev_shufflemask_vtrn
+  ; CHECK-NOT: vtrn
+  ; CHECK: vzip
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %0 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+  store <4 x i32> %0, <4 x i32>* %B
+  ret void
+}
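
The rejection rule that both hunks add can be illustrated outside of LLVM. The sketch below is a standalone restatement of the check, not code from the patch; the helper name `lowerHalfSmallerThanUpper` is invented for illustration, whereas in the patch the logic sits inline in the VTRN mask predicates. The idea: a shuffle mask twice as long as the input vector is only treated as a VTRN producing both results when every defined element in the lower half indexes a smaller lane than the corresponding element in the upper half, with undef (-1) pairs skipped.

    #include <cassert>
    #include <vector>

    // Standalone restatement of the check added by the patch: for a shuffle
    // mask twice as long as the input vector, each defined element of the
    // lower half must be smaller than the corresponding element of the upper
    // half. A value of -1 denotes an undef lane; pairs containing one are
    // skipped, exactly as the patch skips pairs where M[i] or M[i + NumElts]
    // is negative.
    static bool lowerHalfSmallerThanUpper(const std::vector<int> &M,
                                          unsigned NumElts) {
      assert(M.size() == NumElts * 2 && "mask must be twice the vector length");
      for (unsigned i = 0; i < NumElts; ++i)
        if (M[i] >= 0 && M[i + NumElts] >= 0 &&
            (unsigned)M[i] >= (unsigned)M[i + NumElts])
          return false;
      return true;
    }

    int main() {
      // <0,4,2,6,1,5,3,7>: both VTRN results in order -- accepted.
      assert(lowerHalfSmallerThanUpper({0, 4, 2, 6, 1, 5, 3, 7}, 4));
      // <1,5,3,7,0,4,2,6>: the two results reversed -- rejected, which is what
      // the vuzp/vzip tests rely on.
      assert(!lowerHalfSmallerThanUpper({1, 5, 3, 7, 0, 4, 2, 6}, 4));
      // Undef lower half, as in vtrn_with_undef_lower_twice -- still accepted.
      assert(lowerHalfSmallerThanUpper({-1, -1, -1, -1, 1, 5, 3, 7}, 4));
      return 0;
    }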