x86: Only align destination to 1x VEC_SIZE in memset 4x loop

Current code aligns to 2x VEC_SIZE. Aligning to 2x has no affect on
performance other than potentially resulting in an additional
iteration of the loop.
1x maintains aligned stores (the only reason to align in this case)
and doesn't incur any unnecessary loop iterations.
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>

(cherry picked from commit 9469261cf1)
This commit is contained in:
Noah Goldstein 2023-11-01 15:30:26 -05:00 committed by Sunil K Pandey
parent 7772f9358c
commit 5a64f93365
1 changed files with 1 additions and 1 deletions

View File

@ -293,7 +293,7 @@ L(more_2x_vec):
leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
#endif
/* Align dst for loop. */
andq $(VEC_SIZE * -2), %LOOP_REG
andq $(VEC_SIZE * -1), %LOOP_REG
.p2align 4
L(loop):
VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG)