This change makes it possible to restore instructions that were removed by the InstCombine pass, so that an AVX 256-bit shuffle operation can be vectorized.
Also, !dereferenceable_or_null metadata is now attached to loads of non-pointer types, but according to http://llvm.org/docs/LangRef.html#load-instruction this change looks correct to me:
"The optional !dereferenceable_or_null metadata must reference a single metadata name <deref_bytes_node> corresponding to a metadata node with one i64 entry. The existence of the !dereferenceable_or_null metadata on the instruction tells the optimizer that the value loaded is known to be either dereferenceable or null. The number of bytes known to be dereferenceable is specified by the integer value in the metadata node."
So, for this code:
__m256d vsht_d4_fold(const double* ptr) {
  __m256d foo = (__m256d){ ptr[0], ptr[1], ptr[2], ptr[3] };
  return __builtin_shufflevector( foo, foo, 0, 0, 2, 2 );
}
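As a side note (not part of the patch), the shuffle indices 0, 0, 2, 2 mean that only ptr[0] and ptr[2] actually reach the result, which is why InstCombine is able to drop two of the four scalar loads; a hypothetical scalar equivalent would be:

  #include <immintrin.h>

  /* Hypothetical equivalent of vsht_d4_fold: the shuffle duplicates
     elements 0 and 2, so ptr[1] and ptr[3] never affect the result. */
  __m256d vsht_d4_fold_equiv(const double* ptr) {
      return (__m256d){ ptr[0], ptr[0], ptr[2], ptr[2] };
  }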
Currently, after the InstCombine pass we have:
define <4 x double> @load_four_scalars_but_use_two(double* %ptr) #0 {
  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
  %ld0 = load double, double* %ptr, align 8
  %ld2 = load double, double* %arrayidx2, align 8
  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
  %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2
  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}
which results in this assembly output:
vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd 16(%rdi), %xmm1 # xmm1 = mem[0],zero
vinsertf128 $1, %xmm1, %ymm0, %ymm0
vmovddup %ymm0, %ymm0 # ymm0 = ymm0[0,0,2,2]
retq
After this change, we get the following output after InstCombine:
define <4 x double> @load_four_scalars_but_use_two(double* nocapture readonly %ptr) local_unnamed_addr #0 {
  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
  %ld0 = load double, double* %ptr, align 8, !dereferenceable_or_null !0
  %ld2 = load double, double* %arrayidx2, align 8, !dereferenceable_or_null !1
  %ins0 = insertelement <4 x double> undef, double %ld0, i32 0
  %ins2 = insertelement <4 x double> %ins0, double %ld2, i32 2
  %shuffle = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}
and the following IR after the SLP Vectorizer, which can now widen the two scalar loads into a single <4 x double> load:
define <4 x double> @load_four_scalars_but_use_two(double* nocapture readonly %ptr) local_unnamed_addr #0 {
  %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2
  %1 = getelementptr double, double* %ptr, i64 1
  %2 = getelementptr double, double* %ptr, i64 3
  %3 = bitcast double* %ptr to <4 x double>*
  %4 = load <4 x double>, <4 x double>* %3, align 8
  %5 = extractelement <4 x double> %4, i32 0
  %ins0 = insertelement <4 x double> undef, double %5, i32 0
  %6 = extractelement <4 x double> %4, i32 1
  %7 = insertelement <4 x double> %ins0, double %6, i64 1
  %8 = extractelement <4 x double> %4, i32 2
  %ins2 = insertelement <4 x double> %7, double %8, i32 2
  %9 = extractelement <4 x double> %4, i32 3
  %10 = insertelement <4 x double> %ins2, double %9, i64 3
  %shuffle = shufflevector <4 x double> %10, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuffle
}
which allows us to get this assembly output:
vmovddup (%rdi), %ymm0 # ymm0 = mem[0,0,2,2]
retq
You really shouldn't have global variables like this; they need to go and be embedded in the combine instead.