-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathggml.go
224 lines (203 loc) · 7.93 KB
/
ggml.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
package gguf_parser
import (
"errors"
"fmt"
"slices"
)
// Types for GGMLType.
type (
// GGMLType is a type of GGML tensor,
// see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/include/ggml.h#L363-L401.
//
// The known values are the GGMLType* constants declared in this file.
GGMLType uint32
// GGMLTypeTrait holds the trait of a GGMLType,
// see https://github.com/ggerganov/llama.cpp/blob/b34e02348064c2f0cef1f89b44d9bee4eb15b9e7/ggml/src/ggml.c#L663-L1082.
//
// RowSizeOf combines the fields as TypeSize * elements / BlockSize,
// so BlockSize is used as a divisor and TypeSize as a multiplier.
GGMLTypeTrait struct {
// BlockSize is the divisor used by RowSizeOf.
BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64.
// TypeSize is the multiplier used by RowSizeOf.
TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64.
// Quantized is the flag set on the quantized entries of the traits table.
Quantized bool
}
)
// GGMLType constants.
//
// The values are assigned by iota in declaration order, starting at 0,
// so entries must never be reordered or removed.
//
// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated.
const (
GGMLTypeF32 GGMLType = iota // 0
GGMLTypeF16 // 1
GGMLTypeQ4_0 // 2
GGMLTypeQ4_1 // 3
GGMLTypeQ4_2 // 4, deprecated
GGMLTypeQ4_3 // 5, deprecated
GGMLTypeQ5_0 // 6
GGMLTypeQ5_1 // 7
GGMLTypeQ8_0 // 8
GGMLTypeQ8_1 // 9
GGMLTypeQ2_K // 10
GGMLTypeQ3_K // 11
GGMLTypeQ4_K // 12
GGMLTypeQ5_K // 13
GGMLTypeQ6_K // 14
GGMLTypeQ8_K // 15
GGMLTypeIQ2_XXS // 16
GGMLTypeIQ2_XS // 17
GGMLTypeIQ3_XXS // 18
GGMLTypeIQ1_S // 19
GGMLTypeIQ4_NL // 20
GGMLTypeIQ3_S // 21
GGMLTypeIQ2_S // 22
GGMLTypeIQ4_XS // 23
GGMLTypeI8 // 24
GGMLTypeI16 // 25
GGMLTypeI32 // 26
GGMLTypeI64 // 27
GGMLTypeF64 // 28
GGMLTypeIQ1_M // 29
GGMLTypeBF16 // 30
GGMLTypeQ4_0_4_4 // 31
GGMLTypeQ4_0_4_8 // 32
GGMLTypeQ4_0_8_8 // 33
GGMLTypeTQ1_0 // 34
GGMLTypeTQ2_0 // 35
GGMLTypeIQ4_NL_4_4 // 36
GGMLTypeIQ4_NL_4_8 // 37
GGMLTypeIQ4_NL_8_8 // 38
_GGMLTypeCount // 39, one past the last declared type. Unknown
)
// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType.
//
// It is the lookup source for GGMLType.Trait; a GGMLType without an entry
// here makes Trait report false.
//
// The deprecated types are registered with a zero BlockSize and TypeSize.
// RowSizeOf divides by BlockSize, so those types cannot be sized.
var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{
GGMLTypeF32: {BlockSize: 1, TypeSize: 4},
GGMLTypeF16: {BlockSize: 1, TypeSize: 2},
GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true},
GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated
GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated
GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true},
GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true},
GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true},
GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true},
GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true},
GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true},
GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true},
GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true},
GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true},
GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true},
GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true},
GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true},
GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true},
GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true},
GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true},
GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true},
GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true},
GGMLTypeI8: {BlockSize: 1, TypeSize: 1},
GGMLTypeI16: {BlockSize: 1, TypeSize: 2},
GGMLTypeI32: {BlockSize: 1, TypeSize: 4},
GGMLTypeI64: {BlockSize: 1, TypeSize: 8},
GGMLTypeF64: {BlockSize: 1, TypeSize: 8},
GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true},
GGMLTypeBF16: {BlockSize: 1, TypeSize: 2},
GGMLTypeQ4_0_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeQ4_0_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeQ4_0_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeTQ1_0: {BlockSize: 256, TypeSize: 54, Quantized: true},
GGMLTypeTQ2_0: {BlockSize: 256, TypeSize: 66, Quantized: true},
GGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true},
GGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true},
}
// Trait looks up the GGMLTypeTrait registered for the GGMLType.
//
// The boolean result reports whether the type has an entry in the traits
// table; when it is false the returned trait is the zero value.
func (t GGMLType) Trait() (GGMLTypeTrait, bool) {
	if trait, found := _GGMLTypeTraits[t]; found {
		return trait, true
	}
	return GGMLTypeTrait{}, false
}
// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait,
// which is inspired by
// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145.
//
// The index of the given dimensions means the number of dimension,
// i.e. 0 is the first dimension, 1 is the second dimension, and so on.
//
// The value of the item is the number of elements in the corresponding dimension.
//
// RowSizeOf panics when dimensions is empty, when the type has no registered
// trait, or when the registered trait has a zero block size (which is how the
// deprecated types are registered).
func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 {
	if len(dimensions) == 0 {
		panic(errors.New("no dimensions"))
	}

	tt, ok := t.Trait()
	if !ok {
		panic(fmt.Errorf("invalid type: %v", t))
	}
	// A zero block size would otherwise surface as an opaque
	// "integer divide by zero" runtime panic in the division below.
	if tt.BlockSize == 0 {
		panic(fmt.Errorf("invalid type: %v: zero block size", t))
	}

	// https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643
	ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size
	// Every further dimension multiplies the size of the first row.
	for _, d := range dimensions[1:] {
		ds *= d
	}
	return ds
}
// GGMLMemoryPadding pads the given size with GGMLPadding using the fixed
// GGML memory alignment of 16,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243.
func GGMLMemoryPadding(size uint64) uint64 {
	const memoryAlign uint64 = 16

	padded := GGMLPadding(size, memoryAlign)
	return padded
}
// GGMLPadding rounds size up using the bit mask derived from align,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255.
//
// NOTE: the mask arithmetic yields a multiple of align only when align is a
// power of two.
func GGMLPadding(size, align uint64) uint64 {
	mask := align - 1
	bumped := size + mask
	// Clear the low bits covered by the mask.
	return bumped &^ mask
}
// GGML tensor constants.
//
// Both values are untyped constants expressed in bytes;
// GGMLTensorOverhead returns their sum.
const (
// GGMLTensorSize is the size of GGML tensor in bytes,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606.
GGMLTensorSize = 368
// GGMLObjectSize is the size of GGML object in bytes,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563.
GGMLObjectSize = 32
)
// GGMLTensorOverhead reports the per-tensor overhead in bytes, i.e. the sum
// of GGMLObjectSize and GGMLTensorSize,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767.
func GGMLTensorOverhead() uint64 {
	var overhead uint64 = GGMLObjectSize
	overhead += GGMLTensorSize
	return overhead
}
// GGML computation graph constants.
//
// GGMLComputationGraphSize is the fixed part of the result of
// GGMLComputationGraphOverhead; the two node counts are candidate values for
// its nodes argument.
const (
// GGMLComputationGraphSize is the size of GGML computation graph in bytes.
GGMLComputationGraphSize = 80
// GGMLComputationGraphNodesMaximum is the maximum nodes of the computation graph,
// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L103.
GGMLComputationGraphNodesMaximum = 8192
// GGMLComputationGraphNodesDefault is the default nodes of the computation graph,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L237.
GGMLComputationGraphNodesDefault = 2048
)
// GGMLComputationGraphOverhead is the overhead of GGML graph in bytes,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L18905-L18917.
func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 {
const pointerSize = 8
var g uint64 = GGMLComputationGraphSize
g += pointerSize * nodes * 2
if grads {
g += pointerSize * nodes
}
g += pointerSize * GGMLHashSize(nodes)
return GGMLObjectSize + GGMLMemoryPadding(g)
}
// GGMLHashSize returns the size of the hash table for the given base,
// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722.
//
// The result is the smallest entry of the prime table below that is greater
// than or equal to base. When base is larger than every entry of the table,
// base | 1 is returned instead.
func GGMLHashSize(base uint64) uint64 {
	// Sorted ascending, which slices.BinarySearch requires.
	primes := []uint64{
		2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
		2053, 4099, 8209, 16411, 32771, 65537, 131101,
		262147, 524309, 1048583, 2097169, 4194319, 8388617,
		16777259, 33554467, 67108879, 134217757, 268435459,
		536870923, 1073741827, 2147483659,
	}

	// BinarySearch yields the index of base when present, otherwise the
	// insertion point, i.e. the index of the first entry >= base.
	i, _ := slices.BinarySearch(primes, base)
	if i == len(primes) {
		// base exceeds the table; fall back to the nearest odd number.
		return base | 1
	}
	return primes[i]
}