Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -316,10 +316,6 @@
   }
 }
 
-static bool isOdd(unsigned Value) {
-  return Value & 1;
-}
-
 static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
                             unsigned CheckedOpcode) {
   return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
@@ -378,7 +374,7 @@
     unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
       unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
-      if (InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode))
+      if (!sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
        return InstructionsState(VL[0], 0, false);
     }
   }
@@ -3510,22 +3506,26 @@
       // Create shuffle to take alternate operations from the vector.
       // Also, gather up odd and even scalar ops to propagate IR flags to
       // each vector operation.
-      ValueList OddScalars, EvenScalars;
+      ValueList OpScalars, AltScalars;
       unsigned e = E->Scalars.size();
       SmallVector<Constant *, 8> Mask(e);
       for (unsigned i = 0; i < e; ++i) {
-        if (isOdd(i)) {
+        auto *OpInst = cast<Instruction>(E->Scalars[i]);
+        unsigned InstOpcode = OpInst->getOpcode();
+        assert(sameOpcodeOrAlt(S.Opcode, AltOpcode, InstOpcode) &&
+               "Unexpected main/alternate opcode");
+        if (InstOpcode == AltOpcode) {
           Mask[i] = Builder.getInt32(e + i);
-          OddScalars.push_back(E->Scalars[i]);
+          AltScalars.push_back(E->Scalars[i]);
         } else {
           Mask[i] = Builder.getInt32(i);
-          EvenScalars.push_back(E->Scalars[i]);
+          OpScalars.push_back(E->Scalars[i]);
         }
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
-      propagateIRFlags(V0, EvenScalars);
-      propagateIRFlags(V1, OddScalars);
+      propagateIRFlags(V0, OpScalars);
+      propagateIRFlags(V1, AltScalars);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       if (Instruction *I = dyn_cast<Instruction>(V))
Index: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -270,22 +270,34 @@
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_2:%.*]] = sub i32 [[V0_2]], [[V1_2]]
 ; CHECK-NEXT:    [[TMP1_3:%.*]] = sub i32 [[V0_3]], [[V1_3]]
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP0_0]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP0_2]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1_2]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1_1]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0_1]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_3]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1_3]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = lshr <4 x i32> [[TMP9]], <i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT:    [[TMP11:%.*]] = and <4 x i32> [[TMP10]], <i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw <4 x i32> [[TMP11]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP14]])
-; CHECK-NEXT:    ret i32 [[TMP15]]
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP2_2:%.*]] = add i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT:    [[TMP2_3:%.*]] = add i32 [[TMP1_2]], [[TMP1_3]]
+; CHECK-NEXT:    [[TMP3_0:%.*]] = lshr i32 [[TMP2_0]], 15
+; CHECK-NEXT:    [[TMP3_1:%.*]] = lshr i32 [[TMP2_1]], 15
+; CHECK-NEXT:    [[TMP3_2:%.*]] = lshr i32 [[TMP2_2]], 15
+; CHECK-NEXT:    [[TMP3_3:%.*]] = lshr i32 [[TMP2_3]], 15
+; CHECK-NEXT:    [[TMP4_0:%.*]] = and i32 [[TMP3_0]], 65537
+; CHECK-NEXT:    [[TMP4_1:%.*]] = and i32 [[TMP3_1]], 65537
+; CHECK-NEXT:    [[TMP4_2:%.*]] = and i32 [[TMP3_2]], 65537
+; CHECK-NEXT:    [[TMP4_3:%.*]] = and i32 [[TMP3_3]], 65537
+; CHECK-NEXT:    [[TMP5_0:%.*]] = mul nuw i32 [[TMP4_0]], 65535
+; CHECK-NEXT:    [[TMP5_1:%.*]] = mul nuw i32 [[TMP4_1]], 65535
+; CHECK-NEXT:    [[TMP5_2:%.*]] = mul nuw i32 [[TMP4_2]], 65535
+; CHECK-NEXT:    [[TMP5_3:%.*]] = mul nuw i32 [[TMP4_3]], 65535
+; CHECK-NEXT:    [[TMP6_0:%.*]] = add i32 [[TMP5_0]], [[TMP2_0]]
+; CHECK-NEXT:    [[TMP6_1:%.*]] = add i32 [[TMP5_1]], [[TMP2_1]]
+; CHECK-NEXT:    [[TMP6_2:%.*]] = add i32 [[TMP5_2]], [[TMP2_2]]
+; CHECK-NEXT:    [[TMP6_3:%.*]] = add i32 [[TMP5_3]], [[TMP2_3]]
+; CHECK-NEXT:    [[TMP7_0:%.*]] = xor i32 [[TMP6_0]], [[TMP5_0]]
+; CHECK-NEXT:    [[TMP7_1:%.*]] = xor i32 [[TMP6_1]], [[TMP5_1]]
+; CHECK-NEXT:    [[TMP7_2:%.*]] = xor i32 [[TMP6_2]], [[TMP5_2]]
+; CHECK-NEXT:    [[TMP7_3:%.*]] = xor i32 [[TMP6_3]], [[TMP5_3]]
+; CHECK-NEXT:    [[REDUCE_0:%.*]] = add i32 [[TMP7_1]], [[TMP7_0]]
+; CHECK-NEXT:    [[REDUCE_1:%.*]] = add i32 [[REDUCE_0]], [[TMP7_2]]
+; CHECK-NEXT:    [[REDUCE_2:%.*]] = add i32 [[REDUCE_1]], [[TMP7_3]]
+; CHECK-NEXT:    ret i32 [[REDUCE_2]]
 ;
   %v0.0 = extractelement <4 x i32> %v0, i32 0
   %v0.1 = extractelement <4 x i32> %v0, i32 1
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -182,25 +182,15 @@
 }
 
 ; Function Attrs: nounwind uwtable
-define void @No_faddfsub() #0 {
-; CHECK-LABEL: @No_faddfsub(
+define void @faddfsub_select() #0 {
+; CHECK-LABEL: @faddfsub_select(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    store float [[ADD]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd float [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    store float [[ADD1]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[ADD2:%.*]] = fadd float [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    store float [[ADD2]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fsub float [[TMP6]], [[TMP7]]
-; CHECK-NEXT:    store float [[SUB]], float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: