diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -625,14 +625,11 @@
} else if (AlignType == VECTOR_ALIGN) {
// By default, use natural alignment for vector types. This is consistent
// with what clang and llvm-gcc do.
- unsigned Alignment =
-     getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+ //
// We're only calculating a natural alignment, so it doesn't have to be
// based on the full size for scalable vectors. Using the minimum element
// count should be enough here.
- Alignment *= cast<VectorType>(Ty)->getElementCount().getKnownMinValue();
- Alignment = PowerOf2Ceil(Alignment);
- return Align(Alignment);
+ return Align(PowerOf2Ceil(getTypeStoreSize(Ty).getKnownMinSize()));
}
// If we still couldn't find a reasonable default alignment, fall back
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1971,9 +1971,6 @@
}
defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
- defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
- defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
- defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
@@ -1984,9 +1981,6 @@
}
defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
- defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
- defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
- defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
multiclass ld1 {
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/ret-vec-promote.ll b/llvm/test/CodeGen/AArch64/GlobalISel/ret-vec-promote.ll
--- a/llvm/test/CodeGen/AArch64/GlobalISel/ret-vec-promote.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/ret-vec-promote.ll
@@ -7,7 +7,7 @@
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $x0
; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
- ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s1>) = G_LOAD [[COPY]](p0) :: (load 1 from %ir.v, align 4)
+ ; CHECK: [[LOAD:%[0-9]+]]:_(<4 x s1>) = G_LOAD [[COPY]](p0) :: (load 1 from %ir.v)
; CHECK: [[ANYEXT:%[0-9]+]]:_(<4 x s16>) = G_ANYEXT [[LOAD]](<4 x s1>)
; CHECK: $d0 = COPY [[ANYEXT]](<4 x s16>)
; CHECK: RET_ReallyLR implicit $d0
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.ll b/llvm/test/CodeGen/AArch64/spillfill-sve.ll
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.ll
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.ll
@@ -404,8 +404,8 @@
define void @fill_nxv16i1() {
; CHECK-LABEL: fill_nxv16i1
-; CHECK-DAG: ldr p{{[01]}}, [sp, #8, mul vl]
-; CHECK-DAG: ldr p{{[01]}}, [sp]
+; CHECK-DAG: ldr p{{[01]}}, [sp, #7, mul vl]
+; CHECK-DAG: ldr p{{[01]}}, [sp, #6, mul vl]
 %local0 = alloca <vscale x 16 x i1>
 %local1 = alloca <vscale x 16 x i1>
 load volatile <vscale x 16 x i1>, <vscale x 16 x i1>* %local0
@@ -413,81 +413,15 @@
 ret void
}
-define void @fill_nxv8i1() {
-; CHECK-LABEL: fill_nxv8i1
-; CHECK-DAG: ldr p{{[01]}}, [sp, #4, mul vl]
-; CHECK-DAG: ldr p{{[01]}}, [sp]
- %local0 = alloca <vscale x 8 x i1>
- %local1 = alloca <vscale x 8 x i1>
- load volatile <vscale x 8 x i1>, <vscale x 8 x i1>* %local0
- load volatile <vscale x 8 x i1>, <vscale x 8 x i1>* %local1
- ret void
-}
-
-define void @fill_nxv4i1() {
-; CHECK-LABEL: fill_nxv4i1
-; CHECK-DAG: ldr p{{[01]}}, [sp, #6, mul vl]
-; CHECK-DAG: ldr p{{[01]}}, [sp, #4, mul vl]
- %local0 = alloca <vscale x 4 x i1>
- %local1 = alloca <vscale x 4 x i1>
- load volatile <vscale x 4 x i1>, <vscale x 4 x i1>* %local0
- load volatile <vscale x 4 x i1>, <vscale x 4 x i1>* %local1
- ret void
-}
-
-define void @fill_nxv2i1() {
-; CHECK-LABEL: fill_nxv2i1
-; CHECK-DAG: ldr p{{[01]}}, [sp, #7, mul vl]
-; CHECK-DAG: ldr p{{[01]}}, [sp, #6, mul vl]
- %local0 = alloca <vscale x 2 x i1>
- %local1 = alloca <vscale x 2 x i1>
- load volatile <vscale x 2 x i1>, <vscale x 2 x i1>* %local0
- load volatile <vscale x 2 x i1>, <vscale x 2 x i1>* %local1
- ret void
-}
-
; Predicate spills
define void @spill_nxv16i1(<vscale x 16 x i1> %v0, <vscale x 16 x i1> %v1) {
; CHECK-LABEL: spill_nxv16i1
-; CHECK-DAG: str p{{[01]}}, [sp, #8, mul vl]
-; CHECK-DAG: str p{{[01]}}, [sp]
+; CHECK-DAG: str p{{[01]}}, [sp, #7, mul vl]
+; CHECK-DAG: str p{{[01]}}, [sp, #6, mul vl]
 %local0 = alloca <vscale x 16 x i1>
 %local1 = alloca <vscale x 16 x i1>
 store volatile <vscale x 16 x i1> %v0, <vscale x 16 x i1>* %local0
 store volatile <vscale x 16 x i1> %v1, <vscale x 16 x i1>* %local1
 ret void
}
-
-define void @spill_nxv8i1(<vscale x 8 x i1> %v0, <vscale x 8 x i1> %v1) {
-; CHECK-LABEL: spill_nxv8i1
-; CHECK-DAG: str p{{[01]}}, [sp, #4, mul vl]
-; CHECK-DAG: str p{{[01]}}, [sp]
- %local0 = alloca <vscale x 8 x i1>
- %local1 = alloca <vscale x 8 x i1>
- store volatile <vscale x 8 x i1> %v0, <vscale x 8 x i1>* %local0
- store volatile <vscale x 8 x i1> %v1, <vscale x 8 x i1>* %local1
- ret void
-}
-
-define void @spill_nxv4i1(<vscale x 4 x i1> %v0, <vscale x 4 x i1> %v1) {
-; CHECK-LABEL: spill_nxv4i1
-; CHECK-DAG: str p{{[01]}}, [sp, #6, mul vl]
-; CHECK-DAG: str p{{[01]}}, [sp, #4, mul vl]
- %local0 = alloca <vscale x 4 x i1>
- %local1 = alloca <vscale x 4 x i1>
- store volatile <vscale x 4 x i1> %v0, <vscale x 4 x i1>* %local0
- store volatile <vscale x 4 x i1> %v1, <vscale x 4 x i1>* %local1
- ret void
-}
-
-define void @spill_nxv2i1(<vscale x 2 x i1> %v0, <vscale x 2 x i1> %v1) {
-; CHECK-LABEL: spill_nxv2i1
-; CHECK-DAG: str p{{[01]}}, [sp, #7, mul vl]
-; CHECK-DAG: str p{{[01]}}, [sp, #6, mul vl]
- %local0 = alloca <vscale x 2 x i1>
- %local1 = alloca <vscale x 2 x i1>
- store volatile <vscale x 2 x i1> %v0, <vscale x 2 x i1>* %local0
- store volatile <vscale x 2 x i1> %v1, <vscale x 2 x i1>* %local1
- ret void
-}
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -47,22 +47,22 @@
; p3 = %p3
; x0 = &%p4
; x1 = &%p5
-define aarch64_sve_vector_pcs @callee_with_many_svepred_arg( %p0, %p1, %p2, %p3, %p4, %p5) {
+define aarch64_sve_vector_pcs @callee_with_many_svepred_arg( %p0, %p1, %p2, %p3, %p4, %p5) {
; CHECK: name: callee_with_many_svepred_arg
; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
; CHECK-DAG: [[RES:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0
; CHECK-DAG: $p0 = COPY [[RES]]
; CHECK: RET_ReallyLR implicit $p0
- ret %p5
+ ret %p5
}
; Test that p4 and p5 are passed by reference.
-define aarch64_sve_vector_pcs @caller_with_many_svepred_arg( %p) {
+define aarch64_sve_vector_pcs @caller_with_many_svepred_arg( %p) {
; CHECK: name: caller_with_many_svepred_arg
; CHECK: stack:
-; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
; CHECK-NEXT: stack-id: sve-vec
-; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 2, alignment: 2,
; CHECK-NEXT: stack-id: sve-vec
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
@@ -72,8 +72,8 @@
; CHECK-DAG: $x1 = COPY [[BASE2]]
; CHECK-NEXT: BL @callee_with_many_svepred_arg
; CHECK: RET_ReallyLR implicit $p0
- %ret = call aarch64_sve_vector_pcs @callee_with_many_svepred_arg( %p, %p, %p, %p, %p, %p)
- ret %ret
+ %ret = call aarch64_sve_vector_pcs @callee_with_many_svepred_arg( %p, %p, %p, %p, %p, %p)
+ ret %ret
}
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll --- a/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -916,33 +916,35 @@ define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 { ; HSA-LABEL: @kern_realign_i1_v3i1( -; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0 ; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* ; HSA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 ; HSA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 -; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4 +; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0 ; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 -; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1> +; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0 +; HSA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; HSA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3 +; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1> ; HSA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1 -; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4 +; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 1 ; HSA-NEXT: ret void ; ; MESA-LABEL: @kern_realign_i1_v3i1( -; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() +; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* ; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1 -; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 40 +; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* 
[[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0 -; MESA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3 -; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1> +; MESA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 +; MESA-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 8 +; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i3 +; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP5]] to <3 x i1> ; MESA-NEXT: store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1 -; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4 +; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 1 ; MESA-NEXT: ret void ; store volatile i1 %arg0, i1 addrspace(1)* undef diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -479,8 +479,9 @@ ; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp une <2 x half> %a, %b @@ -500,8 +501,9 @@ ; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ueq <2 x half> %a, %b @@ -521,8 +523,9 @@ ; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ugt <2 x half> %a, %b @@ -542,8 +545,9 @@ ; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp uge <2 x half> %a, %b @@ -563,8 +567,9 @@ ; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; 
CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ult <2 x half> %a, %b @@ -584,8 +589,9 @@ ; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ule <2 x half> %a, %b @@ -606,8 +612,9 @@ ; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp uno <2 x half> %a, %b @@ -627,8 +634,9 @@ ; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp one <2 x half> %a, %b @@ -648,8 +656,9 @@ ; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp oeq <2 x half> %a, %b @@ -669,8 +678,9 @@ ; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ogt <2 x half> %a, %b @@ -690,8 +700,9 @@ ; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x 
i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp oge <2 x half> %a, %b @@ -711,8 +722,9 @@ ; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp olt <2 x half> %a, %b @@ -732,8 +744,9 @@ ; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ole <2 x half> %a, %b @@ -753,8 +766,9 @@ ; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] ; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] ; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-NEXT: st.param.b8 [func_retval0+0], [[R0]]; ; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: st.param.b8 [func_retval0+1], [[R1]]; ; CHECK-NEXT: ret; define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { %r = fcmp ord <2 x half> %a, %b diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -66,20 +66,20 @@ } ; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. 
-; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v3i1( -; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-NEXT: .param .align 1 .b8 test_v3i1_param_0[1] ; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v3i1_param_0] +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK-DAG: st.param.b8 [param0+0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v3i1, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0]; ; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+0], [[RE0]] ; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; ; CHECK-NEXT: ret; define <3 x i1> @test_v3i1(<3 x i1> %a) { @@ -87,37 +87,43 @@ ret <3 x i1> %r; } -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v4i1( -; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1] +; CHECK: ld.param.u8 [[E0:%rs[0-9]+]], [test_v4i1_param_0] +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: st.param.b8 [param0+0], [[E0]]; +; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni (retval0), ; CHECK: test_v4i1, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0]; +; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1]; +; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK: ld.param.b8 [[RE3:%rs[0-9]+]], [retval0+3]; +; CHECK: st.param.b8 [func_retval0+0], [[RE0]]; +; CHECK: st.param.b8 [func_retval0+1], [[RE1]]; +; CHECK: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK: st.param.b8 [func_retval0+3], [[RE3]]; ; CHECK-NEXT: ret; define <4 x i1> @test_v4i1(<4 x i1> %a) { %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); ret <4 x i1> %r; } -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v5i1( -; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-NEXT: .param .align 1 .b8 test_v5i1_param_0[1] ; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; -; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v5i1_param_0] +; CHECK: .param .align 1 .b8 param0[1]; +; CHECK-DAG: st.param.b8 [param0+0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param 
.align 8 .b8 retval0[8]; +; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_v5i1, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0+0]; ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.b8 [func_retval0+0], [[RE0]] ; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; ; CHECK-NEXT: ret; define <5 x i1> @test_v5i1(<5 x i1> %a) { diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -836,8 +836,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: mov.w lr, #0 ; CHECK-NEXT: @ implicit-def: $q1 @@ -946,7 +946,7 @@ ; CHECK-NEXT: itt mi ; CHECK-NEXT: vmovmi r1, s3 ; CHECK-NEXT: strmi r1, [r0, #12] -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r7, pc} entry: %0 = load <4 x i16>, <4 x i16>* %mask, align 2 @@ -962,8 +962,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: mov.w lr, #0 ; CHECK-NEXT: @ implicit-def: $q1 @@ -1072,7 +1072,7 @@ ; CHECK-NEXT: itt mi ; CHECK-NEXT: vmovmi r1, s3 ; CHECK-NEXT: strmi r1, [r0, #12] -; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r7, pc} entry: %0 = load <4 x i16>, <4 x i16>* %mask, align 2 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -557,8 +557,8 @@ define arm_aapcs_vfpcc <8 x i16> @masked_v8i16_align1_undef(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #8 -; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r1, p0 @@ -620,13 +620,13 @@ ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: ldrhmi r0, [r0, #14] ; CHECK-LE-NEXT: vmovmi.16 q0[7], r0 -; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #8 -; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 @@ -690,7 +690,7 @@ ; CHECK-BE-NEXT: ldrhmi r0, [r0, #14] ; CHECK-BE-NEXT: vmovmi.16 q1[7], r0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -1433,8 +1433,8 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_align1_undef: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #40 -; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: .pad #36 +; CHECK-LE-NEXT: sub sp, #36 ; 
CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: @ implicit-def: $q0 ; CHECK-LE-NEXT: vmrs r1, p0 @@ -1488,7 +1488,7 @@ ; CHECK-LE-NEXT: lsls r1, r1, #24 ; CHECK-LE-NEXT: bmi .LBB45_16 ; CHECK-LE-NEXT: .LBB45_8: @ %else20 -; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: add sp, #36 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB45_9: @ %cond.load ; CHECK-LE-NEXT: ldrh r2, [r0] @@ -1550,13 +1550,13 @@ ; CHECK-LE-NEXT: vldr.16 s4, [sp] ; CHECK-LE-NEXT: vmov r0, s4 ; CHECK-LE-NEXT: vmov.16 q0[7], r0 -; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: add sp, #36 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align1_undef: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #40 -; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: .pad #36 +; CHECK-BE-NEXT: sub sp, #36 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: @ implicit-def: $q1 @@ -1618,7 +1618,7 @@ ; CHECK-BE-NEXT: vmov.16 q1[7], r0 ; CHECK-BE-NEXT: .LBB45_9: @ %else20 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: add sp, #36 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB45_10: @ %cond.load ; CHECK-BE-NEXT: ldrh r2, [r0] diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -185,8 +185,8 @@ define arm_aapcs_vfpcc void @masked_v8i16_align1(<8 x i16> *%dest, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #8 -; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.s16 gt, q0, zr ; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: and r2, r1, #1 @@ -247,13 +247,13 @@ ; CHECK-LE-NEXT: itt mi ; CHECK-LE-NEXT: vmovmi.u16 r1, q0[7] ; CHECK-LE-NEXT: strhmi r1, [r0, #14] -; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #8 -; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.s16 gt, q1, zr ; CHECK-BE-NEXT: vmrs r1, p0 @@ -315,7 +315,7 @@ ; CHECK-BE-NEXT: itt mi ; CHECK-BE-NEXT: vmovmi.u16 r1, q1[7] ; CHECK-BE-NEXT: strhmi r1, [r0, #14] -; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp sgt <8 x i16> %a, zeroinitializer @@ -646,8 +646,8 @@ define arm_aapcs_vfpcc void @masked_v8f16_align1(<8 x half> *%dest, <8 x half> %a, <8 x i16> %b) { ; CHECK-LE-LABEL: masked_v8f16_align1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #40 -; CHECK-LE-NEXT: sub sp, #40 +; CHECK-LE-NEXT: .pad #36 +; CHECK-LE-NEXT: sub sp, #36 ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr ; CHECK-LE-NEXT: movs r2, #0 ; CHECK-LE-NEXT: vmrs r1, p0 @@ -700,7 +700,7 @@ ; CHECK-LE-NEXT: lsls r1, r1, #24 ; CHECK-LE-NEXT: bmi .LBB16_16 ; CHECK-LE-NEXT: .LBB16_8: @ %else14 -; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: add sp, #36 ; CHECK-LE-NEXT: bx lr ; CHECK-LE-NEXT: .LBB16_9: @ %cond.store ; CHECK-LE-NEXT: vstr.16 s0, [sp, #28] @@ -752,13 +752,13 @@ ; CHECK-LE-NEXT: vstr.16 s0, [sp] ; CHECK-LE-NEXT: ldrh.w r1, [sp] ; CHECK-LE-NEXT: strh r1, [r0, #14] -; CHECK-LE-NEXT: add sp, #40 +; CHECK-LE-NEXT: add sp, #36 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8f16_align1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #40 -; CHECK-BE-NEXT: sub sp, #40 +; CHECK-BE-NEXT: .pad #36 +; CHECK-BE-NEXT: 
sub sp, #36 ; CHECK-BE-NEXT: vrev64.16 q2, q1 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.i16 ne, q2, zr @@ -813,7 +813,7 @@ ; CHECK-BE-NEXT: lsls r1, r1, #24 ; CHECK-BE-NEXT: bmi .LBB16_16 ; CHECK-BE-NEXT: .LBB16_8: @ %else14 -; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: add sp, #36 ; CHECK-BE-NEXT: bx lr ; CHECK-BE-NEXT: .LBB16_9: @ %cond.store ; CHECK-BE-NEXT: vstr.16 s4, [sp, #28] @@ -865,7 +865,7 @@ ; CHECK-BE-NEXT: vstr.16 s0, [sp] ; CHECK-BE-NEXT: ldrh.w r1, [sp] ; CHECK-BE-NEXT: strh r1, [r0, #14] -; CHECK-BE-NEXT: add sp, #40 +; CHECK-BE-NEXT: add sp, #36 ; CHECK-BE-NEXT: bx lr entry: %c = icmp ugt <8 x i16> %b, zeroinitializer diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll --- a/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-bitcast.ll @@ -59,8 +59,8 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_to_v8i1(i8 %b, <8 x i16> %a) { ; CHECK-LE-LABEL: bitcast_to_v8i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #8 -; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: uxtb r0, r0 ; CHECK-LE-NEXT: vmov.i8 q1, #0x0 ; CHECK-LE-NEXT: vmov.i8 q2, #0xff @@ -85,13 +85,13 @@ ; CHECK-LE-NEXT: vcmp.i16 ne, q1, zr ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v8i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #8 -; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: uxtb r0, r0 ; CHECK-BE-NEXT: vmov.i8 q1, #0x0 ; CHECK-BE-NEXT: vmov.i8 q2, #0xff @@ -119,7 +119,7 @@ ; CHECK-BE-NEXT: vrev32.16 q0, q0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.16 q0, q1 -; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = bitcast i8 %b to <8 x i1> @@ -130,44 +130,28 @@ define arm_aapcs_vfpcc <16 x i8> @bitcast_to_v16i1(i16 %b, <16 x i8> %a) { ; CHECK-LE-LABEL: bitcast_to_v16i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r6, r7, lr} -; CHECK-LE-NEXT: push {r4, r6, r7, lr} -; CHECK-LE-NEXT: .setfp r7, sp, #8 -; CHECK-LE-NEXT: add r7, sp, #8 -; CHECK-LE-NEXT: .pad #16 -; CHECK-LE-NEXT: sub sp, #16 -; CHECK-LE-NEXT: mov r4, sp -; CHECK-LE-NEXT: bfc r4, #0, #4 -; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: uxth r0, r0 -; CHECK-LE-NEXT: sub.w r4, r7, #8 ; CHECK-LE-NEXT: vmov.i32 q1, #0x0 ; CHECK-LE-NEXT: vmsr p0, r0 ; CHECK-LE-NEXT: vpsel q0, q0, q1 -; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_to_v16i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r4, r6, r7, lr} -; CHECK-BE-NEXT: push {r4, r6, r7, lr} -; CHECK-BE-NEXT: .setfp r7, sp, #8 -; CHECK-BE-NEXT: add r7, sp, #8 -; CHECK-BE-NEXT: .pad #16 -; CHECK-BE-NEXT: sub sp, #16 -; CHECK-BE-NEXT: mov r4, sp -; CHECK-BE-NEXT: bfc r4, #0, #4 -; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vmov.i32 q0, #0x0 ; CHECK-BE-NEXT: uxth r0, r0 -; CHECK-BE-NEXT: sub.w r4, r7, #8 ; CHECK-BE-NEXT: vrev32.8 q0, q0 ; CHECK-BE-NEXT: vmsr p0, r0 ; CHECK-BE-NEXT: vpsel q1, q1, q0 ; CHECK-BE-NEXT: vrev64.8 q0, q1 -; CHECK-BE-NEXT: mov sp, r4 -; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr 
entry: %c = bitcast i16 %b to <16 x i1> %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> zeroinitializer @@ -270,8 +254,8 @@ define arm_aapcs_vfpcc i8 @bitcast_from_v8i1(<8 x i16> %a) { ; CHECK-LE-LABEL: bitcast_from_v8i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .pad #8 -; CHECK-LE-NEXT: sub sp, #8 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i16 eq, q0, zr ; CHECK-LE-NEXT: vmrs r1, p0 ; CHECK-LE-NEXT: and r0, r1, #1 @@ -300,13 +284,13 @@ ; CHECK-LE-NEXT: rsbs r1, r1, #0 ; CHECK-LE-NEXT: bfi r0, r1, #7, #1 ; CHECK-LE-NEXT: uxtb r0, r0 -; CHECK-LE-NEXT: add sp, #8 +; CHECK-LE-NEXT: add sp, #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v8i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .pad #8 -; CHECK-BE-NEXT: sub sp, #8 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vcmp.i16 eq, q1, zr ; CHECK-BE-NEXT: vmrs r1, p0 @@ -336,7 +320,7 @@ ; CHECK-BE-NEXT: rsbs r1, r1, #0 ; CHECK-BE-NEXT: bfi r0, r1, #7, #1 ; CHECK-BE-NEXT: uxtb r0, r0 -; CHECK-BE-NEXT: add sp, #8 +; CHECK-BE-NEXT: add sp, #4 ; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <8 x i16> %a, zeroinitializer @@ -347,40 +331,24 @@ define arm_aapcs_vfpcc i16 @bitcast_from_v16i1(<16 x i8> %a) { ; CHECK-LE-LABEL: bitcast_from_v16i1: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r6, r7, lr} -; CHECK-LE-NEXT: push {r4, r6, r7, lr} -; CHECK-LE-NEXT: .setfp r7, sp, #8 -; CHECK-LE-NEXT: add r7, sp, #8 -; CHECK-LE-NEXT: .pad #16 -; CHECK-LE-NEXT: sub sp, #16 -; CHECK-LE-NEXT: mov r4, sp -; CHECK-LE-NEXT: bfc r4, #0, #4 -; CHECK-LE-NEXT: mov sp, r4 +; CHECK-LE-NEXT: .pad #4 +; CHECK-LE-NEXT: sub sp, #4 ; CHECK-LE-NEXT: vcmp.i8 eq, q0, zr -; CHECK-LE-NEXT: sub.w r4, r7, #8 ; CHECK-LE-NEXT: vmrs r0, p0 ; CHECK-LE-NEXT: uxth r0, r0 -; CHECK-LE-NEXT: mov sp, r4 -; CHECK-LE-NEXT: pop {r4, r6, r7, pc} +; CHECK-LE-NEXT: add sp, #4 +; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: bitcast_from_v16i1: ; CHECK-BE: @ %bb.0: @ %entry -; CHECK-BE-NEXT: .save {r4, r6, r7, lr} -; CHECK-BE-NEXT: push {r4, r6, r7, lr} -; CHECK-BE-NEXT: .setfp r7, sp, #8 -; CHECK-BE-NEXT: add r7, sp, #8 -; CHECK-BE-NEXT: .pad #16 -; CHECK-BE-NEXT: sub sp, #16 -; CHECK-BE-NEXT: mov r4, sp -; CHECK-BE-NEXT: bfc r4, #0, #4 -; CHECK-BE-NEXT: mov sp, r4 +; CHECK-BE-NEXT: .pad #4 +; CHECK-BE-NEXT: sub sp, #4 ; CHECK-BE-NEXT: vrev64.8 q1, q0 -; CHECK-BE-NEXT: sub.w r4, r7, #8 ; CHECK-BE-NEXT: vcmp.i8 eq, q1, zr ; CHECK-BE-NEXT: vmrs r0, p0 ; CHECK-BE-NEXT: uxth r0, r0 -; CHECK-BE-NEXT: mov sp, r4 -; CHECK-BE-NEXT: pop {r4, r6, r7, pc} +; CHECK-BE-NEXT: add sp, #4 +; CHECK-BE-NEXT: bx lr entry: %c = icmp eq <16 x i8> %a, zeroinitializer %b = bitcast <16 x i1> %c to i16 diff --git a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll --- a/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/llvm/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -16,7 +16,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} @@ -45,7 +46,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: 
kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z} @@ -598,7 +600,8 @@ ; ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -678,7 +681,8 @@ ; ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -698,7 +702,8 @@ ; ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -718,7 +723,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -741,7 +747,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -765,7 +772,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -788,7 +796,8 @@ ; ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store: ; AVX512NOTDQ: # %bb.0: -; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0 +; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax +; AVX512NOTDQ-NEXT: kmovd %eax, %k0 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -459,13 +459,13 @@ ; ; X86-LABEL: conv1: ; X86: ## %bb.0: ## %entry -; X86-NEXT: subl $12, %esp -; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movb $-1, (%eax) -; X86-NEXT: movb $-2, (%esp) +; X86-NEXT: movb $-2, {{[0-9]+}}(%esp) ; X86-NEXT: movb $-2, %al -; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %ecx ; X86-NEXT: retl entry: store <8 x i1> , <8 x i1>* %R @@ -2255,7 +2255,8 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL-LABEL: load_8i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw (%rdi), %k1 +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; @@ -2267,7 +2268,8 @@ ; ; AVX512BW-LABEL: load_8i1: ; AVX512BW: ## %bb.0: -; 
AVX512BW-NEXT: kmovw (%rdi), %k1 +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; @@ -2327,7 +2329,8 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL-LABEL: load_2i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw (%rdi), %k1 +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -2342,7 +2345,8 @@ ; ; AVX512BW-LABEL: load_2i1: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2371,7 +2375,8 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL-LABEL: load_4i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw (%rdi), %k1 +; KNL-NEXT: movzbl (%rdi), %eax +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -2386,7 +2391,8 @@ ; ; AVX512BW-LABEL: load_4i1: ; AVX512BW: ## %bb.0: -; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -153,8 +153,10 @@ ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512F-NEXT: kmovw (%ecx), %k0 -; X86-AVX512F-NEXT: kmovw (%eax), %k1 +; X86-AVX512F-NEXT: movzbl (%ecx), %ecx +; X86-AVX512F-NEXT: kmovw %ecx, %k0 +; X86-AVX512F-NEXT: movzbl (%eax), %eax +; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: korw %k1, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %eax ; X86-AVX512F-NEXT: # kill: def $al killed $al killed $eax @@ -162,8 +164,10 @@ ; ; X64-AVX512F-LABEL: select05_mem: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: kmovw (%rsi), %k0 -; X64-AVX512F-NEXT: kmovw (%rdi), %k1 +; X64-AVX512F-NEXT: movzbl (%rsi), %eax +; X64-AVX512F-NEXT: kmovw %eax, %k0 +; X64-AVX512F-NEXT: movzbl (%rdi), %eax +; X64-AVX512F-NEXT: kmovw %eax, %k1 ; X64-AVX512F-NEXT: korw %k1, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: # kill: def $al killed $al killed $eax @@ -173,8 +177,10 @@ ; X86-AVX512BW: # %bb.0: ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512BW-NEXT: kmovw (%ecx), %k0 -; X86-AVX512BW-NEXT: kmovw (%eax), %k1 +; X86-AVX512BW-NEXT: movzbl (%ecx), %ecx +; X86-AVX512BW-NEXT: kmovd %ecx, %k0 +; X86-AVX512BW-NEXT: movzbl (%eax), %eax +; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: korw %k1, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %eax ; X86-AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -182,8 +188,10 @@ ; ; X64-AVX512BW-LABEL: select05_mem: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: kmovw (%rsi), %k0 -; X64-AVX512BW-NEXT: kmovw (%rdi), %k1 +; X64-AVX512BW-NEXT: movzbl (%rsi), %eax +; X64-AVX512BW-NEXT: kmovd %eax, %k0 +; X64-AVX512BW-NEXT: movzbl (%rdi), %eax +; X64-AVX512BW-NEXT: kmovd %eax, %k1 ; X64-AVX512BW-NEXT: korw %k1, %k0, %k0 ; X64-AVX512BW-NEXT: 
kmovd %k0, %eax ; X64-AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -220,8 +228,10 @@ ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512F-NEXT: kmovw (%ecx), %k0 -; X86-AVX512F-NEXT: kmovw (%eax), %k1 +; X86-AVX512F-NEXT: movzbl (%ecx), %ecx +; X86-AVX512F-NEXT: kmovw %ecx, %k0 +; X86-AVX512F-NEXT: movzbl (%eax), %eax +; X86-AVX512F-NEXT: kmovw %eax, %k1 ; X86-AVX512F-NEXT: kandw %k1, %k0, %k0 ; X86-AVX512F-NEXT: kmovw %k0, %eax ; X86-AVX512F-NEXT: # kill: def $al killed $al killed $eax @@ -229,8 +239,10 @@ ; ; X64-AVX512F-LABEL: select06_mem: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: kmovw (%rsi), %k0 -; X64-AVX512F-NEXT: kmovw (%rdi), %k1 +; X64-AVX512F-NEXT: movzbl (%rsi), %eax +; X64-AVX512F-NEXT: kmovw %eax, %k0 +; X64-AVX512F-NEXT: movzbl (%rdi), %eax +; X64-AVX512F-NEXT: kmovw %eax, %k1 ; X64-AVX512F-NEXT: kandw %k1, %k0, %k0 ; X64-AVX512F-NEXT: kmovw %k0, %eax ; X64-AVX512F-NEXT: # kill: def $al killed $al killed $eax @@ -240,8 +252,10 @@ ; X86-AVX512BW: # %bb.0: ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512BW-NEXT: kmovw (%ecx), %k0 -; X86-AVX512BW-NEXT: kmovw (%eax), %k1 +; X86-AVX512BW-NEXT: movzbl (%ecx), %ecx +; X86-AVX512BW-NEXT: kmovd %ecx, %k0 +; X86-AVX512BW-NEXT: movzbl (%eax), %eax +; X86-AVX512BW-NEXT: kmovd %eax, %k1 ; X86-AVX512BW-NEXT: kandw %k1, %k0, %k0 ; X86-AVX512BW-NEXT: kmovd %k0, %eax ; X86-AVX512BW-NEXT: # kill: def $al killed $al killed $eax @@ -249,8 +263,10 @@ ; ; X64-AVX512BW-LABEL: select06_mem: ; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: kmovw (%rsi), %k0 -; X64-AVX512BW-NEXT: kmovw (%rdi), %k1 +; X64-AVX512BW-NEXT: movzbl (%rsi), %eax +; X64-AVX512BW-NEXT: kmovd %eax, %k0 +; X64-AVX512BW-NEXT: movzbl (%rdi), %eax +; X64-AVX512BW-NEXT: kmovd %eax, %k1 ; X64-AVX512BW-NEXT: kandw %k1, %k0, %k0 ; X64-AVX512BW-NEXT: kmovd %k0, %eax ; X64-AVX512BW-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -478,19 +478,13 @@ ; ; AVX512-LABEL: bitcast_v32i16_to_v2i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-32, %rsp -; AVX512-NEXT: subq $32, %rsp ; AVX512-NEXT: vpmovw2m %zmm0, %k0 -; AVX512-NEXT: kmovd %k0, (%rsp) -; AVX512-NEXT: vmovdqa (%rsp), %xmm0 +; AVX512-NEXT: kmovd %k0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; AVX512-NEXT: vmovd %xmm0, %ecx ; AVX512-NEXT: vpextrw $1, %xmm0, %eax ; AVX512-NEXT: addl %ecx, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %1 = icmp slt <32 x i16> %a0, zeroinitializer diff --git a/llvm/test/CodeGen/X86/load-local-v3i129.ll b/llvm/test/CodeGen/X86/load-local-v3i129.ll --- a/llvm/test/CodeGen/X86/load-local-v3i129.ll +++ b/llvm/test/CodeGen/X86/load-local-v3i129.ll @@ -4,8 +4,6 @@ define void @_start() { ; CHECK-LABEL: _start: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: shrdq $2, %rcx, %rax @@ -16,8 +14,6 @@ ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq $-2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq $-1, 
-{{[0-9]+}}(%rsp) -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq Entry: %y = alloca <3 x i129>, align 4 diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll --- a/llvm/test/CodeGen/X86/pr41619.ll +++ b/llvm/test/CodeGen/X86/pr41619.ll @@ -29,23 +29,11 @@ ; This used to crash with mask registers on avx512bw targets. define i32 @bar(double %blah) nounwind { -; AVX-LABEL: bar: -; AVX: ## %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $32, %rsp -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: ## kill: def $eax killed $eax killed $rax -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq -; -; AVX512-LABEL: bar: -; AVX512: ## %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: ## kill: def $eax killed $eax killed $rax -; AVX512-NEXT: retq +; CHECK-LABEL: bar: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax +; CHECK-NEXT: retq %z = bitcast double %blah to i64 %y = trunc i64 %z to i32 %a = bitcast i32 %y to <32 x i1> diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -1382,13 +1382,23 @@ ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_2i1_to_2i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_2i1_to_2i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_2i1_to_2i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; ; X32-SSE2-LABEL: load_sext_2i1_to_2i64: ; X32-SSE2: # %bb.0: # %entry @@ -1619,13 +1629,23 @@ ; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_4i1_to_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_4i1_to_4i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_4i1_to_4i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq ; ; X32-SSE2-LABEL: load_sext_4i1_to_4i32: ; X32-SSE2: # %bb.0: # %entry @@ -1882,12 +1902,21 @@ ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: 
retq ; -; AVX512-LABEL: load_sext_4i1_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_4i1_to_4i64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_4i1_to_4i64: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq ; ; X32-SSE2-LABEL: load_sext_4i1_to_4i64: ; X32-SSE2: # %bb.0: # %entry @@ -2105,7 +2134,8 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; SSE-LABEL: load_sext_8i1_to_8i16: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] @@ -2115,7 +2145,8 @@ ; ; AVX1-LABEL: load_sext_8i1_to_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] @@ -2133,7 +2164,8 @@ ; ; AVX512F-LABEL: load_sext_8i1_to_8i16: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: kmovw (%rdi), %k1 +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -2142,7 +2174,8 @@ ; ; AVX512BW-LABEL: load_sext_8i1_to_8i16: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: kmovw (%rdi), %k0 +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2151,7 +2184,8 @@ ; X32-SSE-LABEL: load_sext_8i1_to_8i16: ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: movzbl (%eax), %eax +; X32-SSE-NEXT: movd %eax, %xmm0 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] @@ -2316,7 +2350,8 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; SSE-LABEL: load_sext_8i1_to_8i32: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movzbl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -2329,8 +2364,9 @@ ; ; AVX1-LABEL: load_sext_8i1_to_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: movzbl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -2351,17 +2387,27 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: load_sext_8i1_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: load_sext_8i1_to_8i32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: movzbl (%rdi), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: load_sext_8i1_to_8i32: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: movzbl (%rdi), %eax +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512BW-NEXT: retq ; ; X32-SSE-LABEL: load_sext_8i1_to_8i32: ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: movzbl (%eax), %eax +; X32-SSE-NEXT: movd %eax, %xmm0 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] ; X32-SSE-NEXT: movdqa %xmm1, %xmm0 @@ -2447,7 +2493,8 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; SSE2-LABEL: load_sext_16i1_to_16i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -2458,7 +2505,8 @@ ; ; SSSE3-LABEL: load_sext_16i1_to_16i8: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: movzwl (%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSSE3-NEXT: pand %xmm1, %xmm0 @@ -2467,7 +2515,8 @@ ; ; SSE41-LABEL: load_sext_16i1_to_16i8: ; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movzwl (%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; SSE41-NEXT: pand %xmm1, %xmm0 @@ -2476,7 +2525,8 @@ ; ; AVX1-LABEL: load_sext_16i1_to_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] ; AVX1-NEXT: # xmm1 = mem[0,0] @@ -2486,7 +2536,8 @@ ; ; AVX2-LABEL: load_sext_16i1_to_16i8: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: movzwl (%rdi), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -2512,7 +2563,8 @@ ; 
X32-SSE2-LABEL: load_sext_16i1_to_16i8: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: movzwl (%eax), %eax +; X32-SSE2-NEXT: movd %eax, %xmm0 ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] ; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] @@ -2524,7 +2576,8 @@ ; X32-SSE41-LABEL: load_sext_16i1_to_16i8: ; X32-SSE41: # %bb.0: # %entry ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE41-NEXT: movzwl (%eax), %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 ; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; X32-SSE41-NEXT: pand %xmm1, %xmm0 @@ -2539,7 +2592,8 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; SSE-LABEL: load_sext_16i1_to_16i16: ; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movzwl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] @@ -2553,7 +2607,8 @@ ; ; AVX1-LABEL: load_sext_16i1_to_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: movzwl (%rdi), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -2593,7 +2648,8 @@ ; X32-SSE-LABEL: load_sext_16i1_to_16i16: ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: movzwl (%eax), %eax +; X32-SSE-NEXT: movd %eax, %xmm0 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -81,8 +81,7 @@ ; sub-byte element size but byte size ; CHECK-LABEL: @merge_store_2_constants_v4i2( -; CHECK: store <4 x i2> -; CHECK: store <4 x i2> +; CHECK: store <8 x i2> {{.*}}, align 1 define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 store <4 x i2> , <4 x i2> addrspace(1)* %out.gep.1 diff --git a/llvm/test/Transforms/SROA/vector-promotion-different-size.ll b/llvm/test/Transforms/SROA/vector-promotion-different-size.ll --- a/llvm/test/Transforms/SROA/vector-promotion-different-size.ll +++ b/llvm/test/Transforms/SROA/vector-promotion-different-size.ll @@ -3,7 +3,7 @@ define <4 x i1> @vector_bitcast() { ; CHECK-LABEL: @vector_bitcast - ; CHECK: alloca i1 + ; CHECK: alloca <3 x i1> %a = alloca <3 x i1> store <3 x i1> , <3 x i1>* %a
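For context, an illustrative sketch (not part of the patch): every test update above follows from the DataLayout.cpp hunk at the top, which derives a vector type's natural alignment from its known-minimum store size rather than from the element alloc size multiplied by the element count. The standalone program below re-implements both computations with plain integers instead of the real LLVM TypeSize/Align APIs, so the helper names are assumptions made for illustration; it shows why, for example, a <vscale x 16 x i1> predicate drops from 16-byte to 2-byte alignment and a <4 x i1> vector from 4-byte to 1-byte alignment.

// Illustrative sketch only: mirrors the old and new natural vector alignment
// computations from the DataLayout.cpp hunk, using plain integers instead of
// the LLVM TypeSize/Align APIs.
#include <cstdint>
#include <iostream>

static uint64_t powerOf2Ceil(uint64_t V) {
  uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

// Old scheme: alloc size of the element type (in bytes) times the element count.
static uint64_t oldNaturalAlign(uint64_t EltAllocBytes, uint64_t NumElts) {
  return powerOf2Ceil(EltAllocBytes * NumElts);
}

// New scheme: power-of-two ceiling of the vector's store size, in bytes.
static uint64_t newNaturalAlign(uint64_t StoreSizeInBits) {
  return powerOf2Ceil((StoreSizeInBits + 7) / 8);
}

int main() {
  // <vscale x 16 x i1>: sixteen 1-bit elements per vscale chunk of the vector.
  std::cout << "nxv16i1: old=" << oldNaturalAlign(1, 16)   // 16
            << " new=" << newNaturalAlign(16) << "\n";     // 2
  // <4 x i1>: four 1-bit elements, one byte of storage.
  std::cout << "v4i1:    old=" << oldNaturalAlign(1, 4)    // 4
            << " new=" << newNaturalAlign(4) << "\n";      // 1
  return 0;
}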