S390: Optimize __memcpy_z196.

This patch introduces an extra loop without pfd instructions,
as it turned out that the pfd instructions are useful
for copies >=64KB but are counterproductive for smaller copies.
This commit is contained in:
Stefan Liebler 2020-06-26 09:45:11 +02:00
parent 2034c70e64
commit 0792c8ae1a
1 changed files with 15 additions and 6 deletions

View File

@ -184,25 +184,34 @@ ENTRY(MEMCPY_Z196)
je .L_Z196_4 je .L_Z196_4
.L_Z196_start2: .L_Z196_start2:
aghi %r4,-1 aghi %r4,-1
srlg %r5,%r4,8 risbg %r5,%r4,8,128+63,56 # r5 = r4 / 256
ltgr %r5,%r5
jne .L_Z196_5 jne .L_Z196_5
.L_Z196_3: .L_Z196_3:
exrl %r4,.L_Z196_14 exrl %r4,.L_Z196_14
.L_Z196_4: .L_Z196_4:
br %r14 br %r14
.L_Z196_5: .L_Z196_5:
cgfi %r5,262144 # Switch to mvcle for copies >64MB cgfi %r5,255 # Switch to loop with pfd for copies >=64kB
jh __memcpy_mvcle jh .L_Z196_6
.L_Z196_2: .L_Z196_2:
pfd 1,768(%r3)
pfd 2,768(%r1)
mvc 0(256,%r1),0(%r3) mvc 0(256,%r1),0(%r3)
aghi %r5,-1 aghi %r5,-1
la %r1,256(%r1) la %r1,256(%r1)
la %r3,256(%r3) la %r3,256(%r3)
jne .L_Z196_2 jne .L_Z196_2
j .L_Z196_3 j .L_Z196_3
.L_Z196_6:
cgfi %r5,262144 # Switch to mvcle for copies >64MB
jh __memcpy_mvcle
.L_Z196_7:
pfd 1,1024(%r3)
pfd 2,1024(%r1)
mvc 0(256,%r1),0(%r3)
aghi %r5,-1
la %r1,256(%r1)
la %r3,256(%r3)
jne .L_Z196_7
j .L_Z196_3
.L_Z196_14: .L_Z196_14:
mvc 0(1,%r1),0(%r3) mvc 0(1,%r1),0(%r3)
END(MEMCPY_Z196) END(MEMCPY_Z196)