Deljenje traje citavih 25 taktova zato sto se na Haswell-u i ranijim procesorima ono splituje u dve operacije.
Code:
~/.../examples/assembler >>> cat latency.asm
; latency test
; Output format: 64-bit ELF object file.
format elf64
; Symbols exported to the linker.
public recip
public recip1
public recip2
public recip3
public _rdtsc
; Everything that follows is placed in the executable .text section.
section '.text' executable
; Assembly-time constant: trip count of each timing loop in this file.
N = 1000000
recip:
recip1:
; recip / recip1: in-place reciprocal of the four packed doubles at [rdi],
; computed without a divide instruction. The computation is repeated N times
; so the caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1, ymm2, ymm4 (upper halves of all ymm are zeroed)
;
; Method: y0 = magic - bits(x) is used as a rough first estimate of 1/x.
; With e = 1 - x*y0 the result is y0*(1+e)*(1+e^2)*(1+e^4)*(1+e^8).

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        vpbroadcastq ymm4, [magic]          ; ymm4 = magic constant in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vpsubq  ymm2, ymm4, ymm0            ; ymm2 = y = magic - bits(x)
        vfnmadd213pd ymm0, ymm2, ymm1       ; ymm0 = e = 1 - x*y
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e
        vmulpd  ymm0, ymm0, ymm0            ; e^2
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^2
        vmulpd  ymm0, ymm0, ymm0            ; e^4
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^4
        vmulpd  ymm0, ymm0, ymm0            ; e^8
        vfmadd132pd ymm0, ymm2, ymm2        ; ymm0 = y + y*e^8 (final result)
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
recip2:
; recip2: in-place reciprocal of the four packed doubles at [rdi], seeded by
; the single-precision approximate-reciprocal instruction (vrcpps) and then
; refined in double precision. The computation is repeated N times so the
; caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1, ymm2 (upper halves of all ymm are zeroed)
;
; With y0 the seed and e = 1 - x*y0, the result is
; y0*(1+e)*(1+e^2)*(1+e^4)*(1+e^8).

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vcvtpd2ps xmm2, ymm0                ; narrow x to 4 singles
        vrcpps  xmm2, xmm2                  ; approximate 1/x in single precision
        vcvtps2pd ymm2, xmm2                ; widen the seed y back to doubles
        vfnmadd213pd ymm0, ymm2, ymm1       ; ymm0 = e = 1 - x*y
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e
        vmulpd  ymm0, ymm0, ymm0            ; e^2
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^2
        vmulpd  ymm0, ymm0, ymm0            ; e^4
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^4
        vmulpd  ymm0, ymm0, ymm0            ; e^8
        vfmadd132pd ymm0, ymm2, ymm2        ; ymm0 = y + y*e^8 (final result)
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
recip3:
; recip3: in-place reciprocal of the four packed doubles at [rdi] using the
; hardware divide (vdivpd); serves as the baseline for the other routines.
; The division is repeated N times so the caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1 (upper halves of all ymm are zeroed)

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vdivpd  ymm0, ymm1, ymm0            ; ymm0 = 1.0 / x
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
_rdtsc:
; Returns the 64-bit time-stamp counter in rax. Takes no arguments.
; rdtscp delivers the counter split across edx:eax; it also writes ecx.
rdtscp
; Move the high half into bits 63..32 and merge it with the low half.
shl rdx, 32
or rax, rdx
ret
; Constants loaded by the reciprocal routines.
section '.data' writeable align 16
align 16
; Bit pattern of the IEEE-754 double 1.0.
one dq 3FF0000000000000h
; Integer constant from which recip1 subtracts the input bits to form its
; first estimate of the reciprocal.
magic dq 7FDE6238502484BAh
~/.../examples/assembler >>> cat latency.asm
; latency test
; Output format: 64-bit ELF object file.
format elf64
; Symbols exported to the linker.
public recip
public recip1
public recip2
public recip3
public _rdtsc
; Everything that follows is placed in the executable .text section.
section '.text' executable
; Assembly-time constant: trip count of each timing loop in this file.
N = 1000000
recip:
recip1:
; recip / recip1: in-place reciprocal of the four packed doubles at [rdi],
; computed without a divide instruction. The computation is repeated N times
; so the caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1, ymm2, ymm4 (upper halves of all ymm are zeroed)
;
; Method: y0 = magic - bits(x) is used as a rough first estimate of 1/x.
; With e = 1 - x*y0 the result is y0*(1+e)*(1+e^2)*(1+e^4)*(1+e^8).

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        vpbroadcastq ymm4, [magic]          ; ymm4 = magic constant in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vpsubq  ymm2, ymm4, ymm0            ; ymm2 = y = magic - bits(x)
        vfnmadd213pd ymm0, ymm2, ymm1       ; ymm0 = e = 1 - x*y
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e
        vmulpd  ymm0, ymm0, ymm0            ; e^2
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^2
        vmulpd  ymm0, ymm0, ymm0            ; e^4
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^4
        vmulpd  ymm0, ymm0, ymm0            ; e^8
        vfmadd132pd ymm0, ymm2, ymm2        ; ymm0 = y + y*e^8 (final result)
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
recip2:
; recip2: in-place reciprocal of the four packed doubles at [rdi], seeded by
; the single-precision approximate-reciprocal instruction (vrcpps) and then
; refined in double precision. The computation is repeated N times so the
; caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1, ymm2 (upper halves of all ymm are zeroed)
;
; With y0 the seed and e = 1 - x*y0, the result is
; y0*(1+e)*(1+e^2)*(1+e^4)*(1+e^8).

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vcvtpd2ps xmm2, ymm0                ; narrow x to 4 singles
        vrcpps  xmm2, xmm2                  ; approximate 1/x in single precision
        vcvtps2pd ymm2, xmm2                ; widen the seed y back to doubles
        vfnmadd213pd ymm0, ymm2, ymm1       ; ymm0 = e = 1 - x*y
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e
        vmulpd  ymm0, ymm0, ymm0            ; e^2
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^2
        vmulpd  ymm0, ymm0, ymm0            ; e^4
        vfmadd132pd ymm2, ymm2, ymm0        ; y = y + y*e^4
        vmulpd  ymm0, ymm0, ymm0            ; e^8
        vfmadd132pd ymm0, ymm2, ymm2        ; ymm0 = y + y*e^8 (final result)
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
recip3:
; recip3: in-place reciprocal of the four packed doubles at [rdi] using the
; hardware divide (vdivpd); serves as the baseline for the other routines.
; The division is repeated N times so the caller can time it.
;
; In:       rdi = pointer to 4 doubles (32 bytes, unaligned access is used)
; Out:      [rdi] overwritten with the four reciprocals
; Clobbers: eax, ymm0, ymm1 (upper halves of all ymm are zeroed)

        ; Load constants
        vbroadcastsd ymm1, [one]            ; ymm1 = 1.0 in every lane
        mov     eax, N
.loop:
        ; The input is reloaded on every pass and the result is stored only
        ; after the loop, so each pass recomputes the same value.
        vmovdqu ymm0, [rdi]                 ; ymm0 = x
        vdivpd  ymm0, ymm1, ymm0            ; ymm0 = 1.0 / x
        dec     eax
        jnz     .loop
        vmovups [rdi], ymm0
        ; Clear the upper ymm halves before returning to the caller, whose
        ; code may use non-VEX SSE instructions (avoids AVX/SSE transition
        ; penalties).
        vzeroupper
        ret
_rdtsc:
; Returns the 64-bit time-stamp counter in rax. Takes no arguments.
; rdtscp delivers the counter split across edx:eax; it also writes ecx.
rdtscp
; Move the high half into bits 63..32 and merge it with the low half.
shl rdx, 32
or rax, rdx
ret
; Constants loaded by the reciprocal routines.
section '.data' writeable align 16
align 16
; Bit pattern of the IEEE-754 double 1.0.
one dq 3FF0000000000000h
; Integer constant from which recip1 subtracts the input bits to form its
; first estimate of the reciprocal.
magic dq 7FDE6238502484BAh
Code:
~/.../examples/assembler >>> fasm latency.asm latencya.o
flat assembler version 1.72 (16384 kilobytes memory)
3 passes, 1024 bytes.
~/.../examples/assembler >>> fasm latency.asm latencya.o
flat assembler version 1.72 (16384 kilobytes memory)
3 passes, 1024 bytes.
Program koji koristi ove rutine je primer napisan u Nimu:
Code:
~/.../examples/assembler >>> cat latency.nim
import strfmt,random
# Seed the random number generator so each run gets different inputs.
randomize()
# Link the object file assembled by fasm from latency.asm.
{.link:"latencya.o".}
# Assembly routines from latency.asm. Each one receives a pointer to the
# first of four consecutive float64 values and overwrites them in place with
# their reciprocals:
#   recip1 - integer magic-constant seed, refined with FMA steps
#   recip2 - vrcpps seed, refined with FMA steps
#   recip3 - plain vdivpd
proc recip1(x:ptr float64){.importc,cdecl.}
proc recip2(x:ptr float64){.importc,cdecl.}
proc recip3(x:ptr float64){.importc,cdecl.}
proc rdtsc():uint64 =
  ## Reads the CPU time-stamp counter with `rdtscp` and returns it as one
  ## 64-bit value (x86-64 with a GCC-compatible C backend only).
  # We have to use emit here: Nim has no volatile qualifier for its asm
  # statement.
  # rdtscp writes edx:eax and additionally ecx, so both rdx and rcx must be
  # listed as clobbered; rax is the declared output.
  {.emit:
    """asm volatile(
    ".intel_syntax noprefix\n"
    "rdtscp\n"
    "shl rdx,32\n"
    "or rax,rdx\n"
    ".att_syntax\n"
    :"=a"(`result`)
    :
    :"rdx", "rcx");
    """.}
# x holds the four random test inputs in 0.0..1000.0; y is a second array of
# the same shape.
var x,y : array[4,float64]
for i in x.mitems :
  # `rand` is the replacement for the deprecated `random` proc.
  i = rand(1000.0)
proc f[F](ff:F,title:string) =
  ## Times the reciprocal routine `ff` 11 times and prints the results.
  ##
  ## `ff` is called with a pointer to the first element of the global `y`,
  ## which holds a copy of the global inputs `x`. Output: `title`, then one
  ## line per input with x, the routine's result and the reference `1/x`,
  ## then one line per repetition with the cost of a bare `rdtsc` pair and
  ## the cost of the call divided by 1000000.0.
  echo title
  var t0,t1,t2:array[11,uint64]
  for i in 0..10 :
    # Restart from the original inputs on every repetition. Otherwise each
    # call would work on the previous call's output, and the printed result
    # would match 1/x only because the repetition count is odd.
    y = x
    t0[i] = rdtsc()
    t1[i] = rdtsc()   # t1 - t0 is the overhead of the timer itself
    ff(addr y[0])
    t2[i] = rdtsc()
  for i in 0..3 :
    echo "{0:24.18f} {1:24.18f} {2:24.18f}".fmt(x[i],y[i],1/x[i])
  for i in 0..10 :
    # 1000000.0 must match the loop count N in latency.asm, so the second
    # column is counter ticks per loop iteration.
    echo "{0:f}\t{1:f}".fmt(float64(t1[i]-t0[i]),float64(t2[i]-t1[i])/1000000.0)
# Benchmark the three reciprocal implementations in turn.
f(recip1,"recip1")
f(recip2,"recip2")
f(recip3,"recip3")
~/.../examples/assembler >>> cat latency.nim
import strfmt,random
# Seed the random number generator so each run gets different inputs.
randomize()
# Link the object file assembled by fasm from latency.asm.
{.link:"latencya.o".}
# Assembly routines from latency.asm. Each one receives a pointer to the
# first of four consecutive float64 values and overwrites them in place with
# their reciprocals:
#   recip1 - integer magic-constant seed, refined with FMA steps
#   recip2 - vrcpps seed, refined with FMA steps
#   recip3 - plain vdivpd
proc recip1(x:ptr float64){.importc,cdecl.}
proc recip2(x:ptr float64){.importc,cdecl.}
proc recip3(x:ptr float64){.importc,cdecl.}
proc rdtsc():uint64 =
  ## Reads the CPU time-stamp counter with `rdtscp` and returns it as one
  ## 64-bit value (x86-64 with a GCC-compatible C backend only).
  # We have to use emit here: Nim has no volatile qualifier for its asm
  # statement.
  # rdtscp writes edx:eax and additionally ecx, so both rdx and rcx must be
  # listed as clobbered; rax is the declared output.
  {.emit:
    """asm volatile(
    ".intel_syntax noprefix\n"
    "rdtscp\n"
    "shl rdx,32\n"
    "or rax,rdx\n"
    ".att_syntax\n"
    :"=a"(`result`)
    :
    :"rdx", "rcx");
    """.}
# x holds the four random test inputs in 0.0..1000.0; y is a second array of
# the same shape.
var x,y : array[4,float64]
for i in x.mitems :
  # `rand` is the replacement for the deprecated `random` proc.
  i = rand(1000.0)
proc f[F](ff:F,title:string) =
  ## Times the reciprocal routine `ff` 11 times and prints the results.
  ##
  ## `ff` is called with a pointer to the first element of the global `y`,
  ## which holds a copy of the global inputs `x`. Output: `title`, then one
  ## line per input with x, the routine's result and the reference `1/x`,
  ## then one line per repetition with the cost of a bare `rdtsc` pair and
  ## the cost of the call divided by 1000000.0.
  echo title
  var t0,t1,t2:array[11,uint64]
  for i in 0..10 :
    # Restart from the original inputs on every repetition. Otherwise each
    # call would work on the previous call's output, and the printed result
    # would match 1/x only because the repetition count is odd.
    y = x
    t0[i] = rdtsc()
    t1[i] = rdtsc()   # t1 - t0 is the overhead of the timer itself
    ff(addr y[0])
    t2[i] = rdtsc()
  for i in 0..3 :
    echo "{0:24.18f} {1:24.18f} {2:24.18f}".fmt(x[i],y[i],1/x[i])
  for i in 0..10 :
    # 1000000.0 must match the loop count N in latency.asm, so the second
    # column is counter ticks per loop iteration.
    echo "{0:f}\t{1:f}".fmt(float64(t1[i]-t0[i]),float64(t2[i]-t1[i])/1000000.0)
# Benchmark the three reciprocal implementations in turn.
f(recip1,"recip1")
f(recip2,"recip2")
f(recip3,"recip3")
Code:
~/.../examples/assembler >>> nim c -d:release latency.nim
Hint: used config file '/home/bmaxa/projects/Nim/config/nim.cfg' [Conf]
Hint: system [Processing]
Hint: latency [Processing]
Hint: strfmt [Processing]
Hint: macros [Processing]
Hint: strutils [Processing]
Hint: parseutils [Processing]
Hint: math [Processing]
Hint: algorithm [Processing]
Hint: unicode [Processing]
Hint: streams [Processing]
Hint: random [Processing]
Hint: times [Processing]
Hint: posix [Processing]
latency.nim(24, 7) Warning: random is deprecated [Deprecated]
Hint: [Link]
Hint: operation successful (25498 lines compiled; 0.409 sec total; 54.746MiB peakmem; Release Build) [SuccessX]
~/.../examples/assembler >>> nim c -d:release latency.nim
Hint: used config file '/home/bmaxa/projects/Nim/config/nim.cfg' [Conf]
Hint: system [Processing]
Hint: latency [Processing]
Hint: strfmt [Processing]
Hint: macros [Processing]
Hint: strutils [Processing]
Hint: parseutils [Processing]
Hint: math [Processing]
Hint: algorithm [Processing]
Hint: unicode [Processing]
Hint: streams [Processing]
Hint: random [Processing]
Hint: times [Processing]
Hint: posix [Processing]
latency.nim(24, 7) Warning: random is deprecated [Deprecated]
Hint: [Link]
Hint: operation successful (25498 lines compiled; 0.409 sec total; 54.746MiB peakmem; Release Build) [SuccessX]
I evo kako kod mene izgleda output:
Code:
~/.../examples/assembler >>> ./latency
recip1
504.966575886583314048 0.001980329090582430 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298545 0.001478356542298544
39.000000 4.449594
21.000000 4.079955
24.000000 4.080168
24.000000 4.109700
24.000000 4.225911
24.000000 4.080264
48.000000 4.096860
21.000000 4.525557
57.000000 5.296398
24.000000 4.464465
39.000000 4.292193
recip2
504.966575886583314048 0.001980329090582427 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298544 0.001478356542298544
24.000000 8.693367
21.000000 8.772282
24.000000 8.470779
24.000000 8.529483
24.000000 9.390477
27.000000 10.128975
24.000000 8.951805
21.000000 8.731911
24.000000 8.738061
48.000000 8.615808
24.000000 8.454282
recip3
504.966575886583314048 0.001980329090582427 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298544 0.001478356542298544
24.000000 26.516313
21.000000 26.567454
21.000000 26.639685
39.000000 26.519364
24.000000 26.308188
24.000000 26.426541
24.000000 26.043252
24.000000 26.151756
24.000000 26.226312
24.000000 26.032281
51.000000 26.181567
~/.../examples/assembler >>> ./latency
recip1
504.966575886583314048 0.001980329090582430 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298545 0.001478356542298544
39.000000 4.449594
21.000000 4.079955
24.000000 4.080168
24.000000 4.109700
24.000000 4.225911
24.000000 4.080264
48.000000 4.096860
21.000000 4.525557
57.000000 5.296398
24.000000 4.464465
39.000000 4.292193
recip2
504.966575886583314048 0.001980329090582427 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298544 0.001478356542298544
24.000000 8.693367
21.000000 8.772282
24.000000 8.470779
24.000000 8.529483
24.000000 9.390477
27.000000 10.128975
24.000000 8.951805
21.000000 8.731911
24.000000 8.738061
48.000000 8.615808
24.000000 8.454282
recip3
504.966575886583314048 0.001980329090582427 0.001980329090582427
212.648786567563917056 0.004702589730895429 0.004702589730895429
701.523519828456414848 0.001425468956827748 0.001425468956827748
676.426810034068466880 0.001478356542298544 0.001478356542298544
24.000000 26.516313
21.000000 26.567454
21.000000 26.639685
39.000000 26.519364
24.000000 26.308188
24.000000 26.426541
24.000000 26.043252
24.000000 26.151756
24.000000 26.226312
24.000000 26.032281
51.000000 26.181567