-
Notifications
You must be signed in to change notification settings - Fork 4
/
asm_vecInvSqrt_avx.s
82 lines (64 loc) · 1.99 KB
/
asm_vecInvSqrt_avx.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// +build avx
// +build amd64
// +build !fastmath
/*
InvSqrt computes the inverse square root (1/√x) of each element of a []float64, in place.
Because VBROADCASTSD can only read from memory, we first back up the first element of the slice
in a register, BX, and then overwrite that element with the constant 1.0.
This is done so that we can broadcast the constant into the Y1 register. Once 1.0 has been
broadcast into Y1, we restore the saved value to the first element of the slice.
The following is then performed on four elements at a time:
Y0 = Sqrt(a[i:i+4])
Y0 = Y1/Y0
Any elements left over (fewer than four) are then processed one at a time.
*/
#include "textflag.h"
// one is the IEEE-754 bit pattern of float64(1.0).
#define one 0x3ff0000000000000

// func InvSqrt(a []float64)
//
// InvSqrt overwrites every element of a with 1/sqrt(a[i]).
//
// Arguments (Go stack-based calling convention):
//   a_base+0(FP)  pointer to the first element
//   a_len+8(FP)   number of elements
//   a_cap+16(FP)  capacity (unused)
//
// Register roles:
//   SI  cursor into the slice, advanced as elements are processed
//   AX  element counter (biased by -4 while inside the vector loop)
//   DX  bit pattern of 1.0
//   BX  temporary copy of a[0] while 1.0 is staged in memory
//   Y1  four lanes of 1.0 (vector loop), X1 scalar 1.0 (tail loop)
//   Y0/X0  working values
//
// The AVX instructions are emitted as raw bytes; the intended mnemonic is
// given next to each encoding.
TEXT ·InvSqrt(SB), NOSPLIT, $0-24
	MOVQ a_base+0(FP), SI
	MOVQ a_len+8(FP), AX

	// Nothing to do when len(a) <= 0.
	TESTQ AX, AX
	JLE done

	MOVQ $one, DX

	// With fewer than four elements the vector path is skipped entirely.
	SUBQ $4, AX
	JL remainder

	// VBROADCASTSD here only takes a memory source, so 1.0 is staged in a[0]:
	// save a[0] in BX, store 1.0 over it, broadcast from that address into Y1,
	// then put the saved value back. a[0] briefly holds 1.0 in memory.
	MOVQ (SI), BX
	MOVQ DX, (SI)
	// VBROADCASTSD (SI), Y1
	BYTE $0xc4; BYTE $0xe2; BYTE $0x7d; BYTE $0x19; BYTE $0x0e // vbroadcastsd (%rsi),%ymm1
	MOVQ BX, (SI)

loop:
	// Four elements per iteration:
	// VSQRTPD (SI), Y0
	// VDIVPD Y0, Y1, Y0     (Y0 = Y1 / Y0)
	// VMOVUPD Y0, (SI)
	BYTE $0xc5; BYTE $0xfd; BYTE $0x51; BYTE $0x06 // vsqrtpd (%rsi),%ymm0
	BYTE $0xc5; BYTE $0xf5; BYTE $0x5e; BYTE $0xc0 // vdivpd %ymm0,%ymm1,%ymm0
	BYTE $0xc5; BYTE $0xfd; BYTE $0x11; BYTE $0x06 // vmovupd %ymm0,(%rsi)

	ADDQ $32, SI
	SUBQ $4, AX
	JGE loop

	// Clear the upper halves of the YMM registers before any legacy SSE
	// instruction runs (the scalar tail below, or the caller's code), to avoid
	// AVX/SSE transition penalties. Only reached after the vector loop has run.
	// VZEROUPPER
	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // vzeroupper

remainder:
	// Undo the -4 bias; AX is now the number of leftover elements (0..3).
	ADDQ $4, AX
	JE done

remainder1:
	// One element per iteration. X1 is reloaded with 1.0 every time because
	// DIVSD leaves its result in X1.
	MOVQ DX, X1
	MOVSD (SI), X0
	SQRTSD X0, X0
	DIVSD X0, X1
	MOVSD X1, (SI)

	ADDQ $8, SI
	DECQ AX
	JNE remainder1

done:
	RET