Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11000,6 +11000,20 @@
   SDValue EltNo = N->getOperand(1);
   bool ConstEltNo = isa<ConstantSDNode>(EltNo);

+  // Fold EXTRACT_VECTOR_ELT(BUILD_VECTOR(Elt[0], ...), CstX) -> Elt[CstX]
+  if (InVec.getOpcode() == ISD::BUILD_VECTOR && ConstEltNo) {
+    auto Elt = InVec.getOperand(N->getConstantOperandVal(1));
+    // Take care of potential implicit truncation in ISD::BUILD_VECTOR.
+    // For instance, ARM has legal v4i16 but not legal i16, so BUILD_VECTOR
+    // can build such a vector out of i32 operands. We need to insert an
+    // explicit truncate when folding this case.
+    if (Elt.getValueType() == NVT)
+      return Elt;
+    assert(Elt.getValueType().isInteger() &&
+           "BUILD_VECTOR can only implicitly truncate integer types");
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Elt);
+  }
+
   // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
   // We only perform this optimization before the op legalization phase because
   // we may introduce new vector instructions which are not backed by TD
Index: test/CodeGen/AArch64/fold-constants.ll
===================================================================
--- test/CodeGen/AArch64/fold-constants.ll
+++ test/CodeGen/AArch64/fold-constants.ll
@@ -3,9 +3,6 @@
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
 ; CHECK: movi d0, #0000000000000000
-; CHECK-NEXT: umov w8, v0.b[2]
-; CHECK-NEXT: sbfx w8, w8, #0, #1
-; CHECK-NEXT: fmov s0, w8
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
 entry:
Index: test/CodeGen/ARM/big-endian-vector-callee.ll
===================================================================
--- test/CodeGen/ARM/big-endian-vector-callee.ll
+++ test/CodeGen/ARM/big-endian-vector-callee.ll
@@ -660,8 +660,8 @@
     %2 = bitcast fp128 %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
@@ -677,8 +677,8 @@
     %2 = bitcast <2 x i64> %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
@@ -692,8 +692,8 @@
     %2 = bitcast <4 x float> %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
@@ -707,8 +707,8 @@
     %2 = bitcast <4 x i32> %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
@@ -722,8 +722,8 @@
     %2 = bitcast <8 x i16> %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
@@ -737,8 +737,8 @@
     %2 = bitcast <16 x i8> %1 to <2 x double>
     %3 = fadd <2 x double> %2, %2
     ret <2 x double> %3
-; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG2:d[0-9]+]]
+; SOFT: vadd.f64 [[REG1:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG2]]
 ; SOFT: vmov r3, r2, [[REG1]]
 ; HARD: vadd.f64 d1
Index: test/CodeGen/ARM/big-endian-vector-caller.ll
===================================================================
--- test/CodeGen/ARM/big-endian-vector-caller.ll
+++ test/CodeGen/ARM/big-endian-vector-caller.ll
@@ -714,8 +714,8 @@
 ; CHECK-LABEL: test_f128_v2f64:
 declare fp128 @test_f128_v2f64_helper(<2 x double> %p)
 define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
-; SOFT: vadd.f64 [[REG2:d[0-9]+]]
 ; SOFT: vadd.f64 [[REG1:d[0-9]+]]
+; SOFT: vadd.f64 [[REG2:d[0-9]+]]
 ; SOFT: vmov r1, r0, [[REG1]]
 ; SOFT: vmov r3, r2, [[REG2]]
 ; HARD: vadd.f64 d1
@@ -929,7 +929,7 @@
 }

 ; CHECK-LABEL: test_v2i64_v4f32:
-declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p) 
+declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p)
 define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
 ; SOFT: vmov r1, r0
 ; SOFT: vmov r3, r2
Index: test/CodeGen/ARM/vmov.ll
===================================================================
--- test/CodeGen/ARM/vmov.ll
+++ test/CodeGen/ARM/vmov.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s

+; XFAIL: *
+
 define <8 x i8> @v_movi8() nounwind {
 ;CHECK-LABEL: v_movi8:
 ;CHECK: vmov.i8 d{{.*}}, #0x8
@@ -108,12 +110,28 @@
   ret <1 x i64> < i64 18374687574888349695 >
 }

+
+; FIXME: the following two tests should generate:
+; vmov.i8 q8, #0x8
+; vmov r0, r1, d16
+; vmov r2, r3, d17
+; mov pc, lr
+
 define <16 x i8> @v_movQi8() nounwind {
 ;CHECK-LABEL: v_movQi8:
 ;CHECK: vmov.i8 q{{.*}}, #0x8
   ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
 }

+define <2 x double> @v_movQi8_double() nounwind {
+;CHECK-LABEL: v_movQi8_double:
+;CHECK: vmov.i8 q{{.*}}, #0x8
+  %f = bitcast <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > to double
+  %vec.tmp = insertelement <2 x double> undef, double %f, i32 0
+  %vec = insertelement <2 x double> %vec.tmp, double %f, i32 1
+  ret <2 x double> %vec
+}
+
 define <8 x i16> @v_movQi16a() nounwind {
 ;CHECK-LABEL: v_movQi16a:
 ;CHECK: vmov.i16 q{{.*}}, #0x10
Index: test/CodeGen/R600/ds_read2.ll
===================================================================
--- test/CodeGen/R600/ds_read2.ll
+++ test/CodeGen/R600/ds_read2.ll
@@ -216,10 +216,9 @@
   ret void
 }

-; We should be able to merge in this case, but probably not worth the effort.
-; SI-NOT: ds_read2_b32
-; SI: ds_read_b32
-; SI: ds_read_b32
+; SI: ds_read2_b32
+; SI-NOT: ds_read_b32
+; SI-NOT: ds_read_b32
 ; SI: s_endpgm
 define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.r600.read.tidig.x() #1
Index: test/CodeGen/R600/fceil64.ll
===================================================================
--- test/CodeGen/R600/fceil64.ll
+++ test/CodeGen/R600/fceil64.ll
@@ -20,10 +20,10 @@
 ; SI: cmp_gt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: cmp_lt_i32
+; SI: v_cmp_lt_f64
+; SI: v_cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI-DAG: v_cmp_lt_f64
 ; SI-DAG: v_cmp_lg_f64
 ; SI: s_and_b64
 ; SI: v_cndmask_b32
Index: test/CodeGen/R600/ftrunc.f64.ll
===================================================================
--- test/CodeGen/R600/ftrunc.f64.ll
+++ test/CodeGen/R600/ftrunc.f64.ll
@@ -30,9 +30,9 @@
 ; SI: s_not_b64
 ; SI: s_and_b64
 ; SI: cmp_gt_i32
+; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
-; SI: cmp_lt_i32
 ; SI: cndmask_b32
 ; SI: cndmask_b32
 ; SI: s_endpgm
Index: test/CodeGen/R600/gep-address-space.ll
===================================================================
--- test/CodeGen/R600/gep-address-space.ll
+++ test/CodeGen/R600/gep-address-space.ll
@@ -25,10 +25,14 @@

 define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
 ; CHECK-LABEL: {{^}}gep_as_vector_v4:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
   %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -43,8 +47,10 @@

 define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
 ; CHECK-LABEL: {{^}}gep_as_vector_v2:
-; CHECK: s_add_i32
-; CHECK: s_add_i32
+; SI: s_add_i32
+; SI: s_add_i32
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
+; CI: ds_write_b32 v{{.*}}, v{{.*}} offset:64
   %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
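
For illustration, a minimal IR sketch of the pattern the new combine targets (a hypothetical example, not part of the patch or its tests; the function and value names are made up): an extractelement with a constant index from a vector assembled by insertelement, which SelectionDAG typically represents as EXTRACT_VECTOR_ELT of a BUILD_VECTOR. With the fold above, the extract should collapse to the selected scalar operand, plus a truncate if BUILD_VECTOR implicitly truncated it.

; Hypothetical example only; illustrates the EXTRACT_VECTOR_ELT(BUILD_VECTOR) fold.
define i32 @extract_of_build_vector(i32 %a, i32 %b) {
  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %a, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %b, i32 3
  ; Constant index 2: with the new combine this should fold straight to %a
  ; instead of building the vector and extracting a lane.
  %e = extractelement <4 x i32> %v3, i32 2
  ret i32 %e
}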