Skip to content

Commit c7de8c0

Browse files
authored
Add pointer to Microsoft.ML.Tokenizers (#37)
1 parent 086544d commit c7de8c0

File tree

3 files changed

+59
-21
lines changed

3 files changed

+59
-21
lines changed

README.md

+41-20
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ SharpToken is a C# library that serves as a port of the Python [tiktoken](https:
1414
It provides functionality for encoding and decoding tokens using GPT-based encodings. This library is built for .NET 6, .NET 8
1515
and .NET Standard 2.0, making it compatible with a wide range of frameworks.
1616

17+
> [!IMPORTANT]
18+
> The functionality in `SharpToken` has been added to [`Microsoft.ML.Tokenizers`](https://www.nuget.org/packages/Microsoft.ML.Tokenizers). `Microsoft.ML.Tokenizers` is a tokenizer library developed by the .NET team and, going forward, it is the central place for tokenizer development in .NET. By using `Microsoft.ML.Tokenizers`, you should see improved performance over existing tokenizer library implementations, including `SharpToken`. A stable release of `Microsoft.ML.Tokenizers` is expected alongside the .NET 9.0 release (November 2024). Instructions for migration can be found in the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
19+
1720
## Installation
1821

1922
To install SharpToken, use the NuGet package manager:
@@ -200,6 +203,7 @@ public class CompareBenchmark
200203
private GptEncoding _sharpToken;
201204
private TikToken _tikToken;
202205
private ITokenizer _tokenizer;
206+
private Tokenizer _mlTokenizer;
203207
private string _kLongText;
204208

205209
[GlobalSetup]
@@ -252,35 +256,52 @@ public class CompareBenchmark
252256

253257
return sum;
254258
}
259+
260+
[Benchmark]
261+
public int MLTokenizers()
262+
{
263+
var sum = 0;
264+
for (var i = 0; i < 10000; i++)
265+
{
266+
var encoded = _mlTokenizer.EncodeToIds(_kLongText);
267+
var decoded = _mlTokenizer.Decode(encoded);
268+
sum += decoded.Length;
269+
}
270+
271+
return sum;
272+
}
255273
}
256274
```
257275

258276
</details>
259277

260278
```
261-
BenchmarkDotNet v0.13.12, Windows 11 (10.0.22631.3296/23H2/2023Update/SunValley3)
262-
AMD Ryzen 9 3900X, 1 CPU, 24 logical and 12 physical cores
263-
.NET SDK 8.0.200
264-
[Host] : .NET 8.0.2 (8.0.224.6711), X64 RyuJIT AVX2
265-
.NET 6.0 : .NET 6.0.16 (6.0.1623.17311), X64 RyuJIT AVX2
266-
.NET 8.0 : .NET 8.0.2 (8.0.224.6711), X64 RyuJIT AVX2
279+
BenchmarkDotNet v0.13.9+228a464e8be6c580ad9408e98f18813f6407fb5a, Windows 11 (10.0.22631.3296)
280+
11th Gen Intel Core i9-11950H 2.60GHz, 1 CPU, 16 logical and 8 physical cores
281+
.NET SDK 9.0.100-preview.2.24157.14
282+
[Host] : .NET 8.0.3 (8.0.324.11423), X64 RyuJIT AVX2
283+
.NET 6.0 : .NET 6.0.28 (6.0.2824.12007), X64 RyuJIT AVX2
284+
.NET 8.0 : .NET 8.0.3 (8.0.324.11423), X64 RyuJIT AVX2
267285
.NET Framework 4.7.1 : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
268286
```
269287

270-
| Method | Job | Runtime | Mean | Error | StdDev | Gen0 | Gen1 | Allocated |
271-
|--------------- |--------------------- |--------------------- |---------:|---------:|---------:|-----------:|----------:|----------:|
272-
| **SharpToken** | .NET 8.0 | .NET 8.0 | 100.4 ms | 1.95 ms | 1.91 ms | 2000.0000 | - | 22.13 MB |
273-
| **SharpToken** | .NET 6.0 | .NET 6.0 | 169.9 ms | 2.42 ms | 2.15 ms | 24333.3333 | 1000.0000 | 196.3 MB |
274-
| **SharpToken** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 455.3 ms | 8.34 ms | 6.97 ms | 34000.0000 | 1000.0000 | 204.39 MB |
275-
| | | | | | | | | |
276-
| *TiktokenSharp*| .NET 8.0 | .NET 8.0 | 211.4 ms | 1.83 ms | 1.53 ms | 42000.0000 | 1000.0000 | 338.98 MB |
277-
| *TiktokenSharp*| .NET 6.0 | .NET 6.0 | 258.6 ms | 5.09 ms | 6.25 ms | 39000.0000 | 1000.0000 | 313.26 MB |
278-
| *TiktokenSharp*| .NET Framework 4.7.1 | .NET Framework 4.7.1 | 638.3 ms | 12.47 ms | 16.21 ms | 63000.0000 | 1000.0000 | 378.31 MB |
279-
| | | | | | | | | |
280-
| *TokenizerLib* | .NET 8.0 | .NET 8.0 | 124.4 ms | 1.81 ms | 1.60 ms | 27250.0000 | 1000.0000 | 217.82 MB |
281-
| *TokenizerLib* | .NET 6.0 | .NET 6.0 | 165.5 ms | 1.38 ms | 1.16 ms | 27000.0000 | 1000.0000 | 217.82 MB |
282-
| *TokenizerLib* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 499.7 ms | 9.81 ms | 14.07 ms | 40000.0000 | 1000.0000 | 243.79 MB |
283-
288+
| Method | Job | Runtime | Mean | Error | StdDev | Median | Gen0 | Gen1 | Allocated |
289+
|------------------ |--------------------- |--------------------- |----------:|---------:|----------:|----------:|-----------:|----------:|----------:|
290+
| **MLTokenizers** | .NET 8.0 | .NET 8.0 | 60.55 ms | 1.143 ms | 1.123 ms | 60.45 ms | 1000.0000 | - | 13.12 MB |
291+
| **MLTokenizers** | .NET 6.0 | .NET 6.0 | 95.75 ms | 1.374 ms | 1.147 ms | 95.54 ms | 10500.0000 | - | 126.19 MB |
292+
| **MLTokenizers** | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 291.77 ms | 5.811 ms | 11.195 ms | 291.64 ms | 21000.0000 | - | 127.33 MB |
293+
| | | | | | | | | | |
294+
| *SharpToken* | .NET 8.0 | .NET 8.0 | 87.78 ms | 1.700 ms | 1.590 ms | 87.34 ms | 1000.0000 | - | 22.13 MB |
295+
| *SharpToken* | .NET 6.0 | .NET 6.0 | 128.84 ms | 1.718 ms | 1.607 ms | 128.17 ms | 16250.0000 | 500.0000 | 196.31 MB |
296+
| *SharpToken* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 356.21 ms | 6.843 ms | 10.854 ms | 355.09 ms | 34000.0000 | 1000.0000 | 204.39 MB |
297+
| | | | | | | | | | |
298+
| *TokenizerLib* | .NET 8.0 | .NET 8.0 | 109.26 ms | 2.082 ms | 4.482 ms | 107.90 ms | 18200.0000 | 600.0000 | 217.82 MB |
299+
| *TokenizerLib* | .NET 6.0 | .NET 6.0 | 126.16 ms | 2.959 ms | 8.630 ms | 122.34 ms | 18000.0000 | 500.0000 | 217.82 MB |
300+
| *TokenizerLib* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 374.71 ms | 7.374 ms | 16.794 ms | 370.12 ms | 40000.0000 | 1000.0000 | 243.79 MB |
301+
| | | | | | | | | | |
302+
| *TiktokenSharp* | .NET 8.0 | .NET 8.0 | 177.34 ms | 3.506 ms | 8.797 ms | 174.98 ms | 28000.0000 | 1000.0000 | 338.98 MB |
303+
| *TiktokenSharp* | .NET 6.0 | .NET 6.0 | 196.17 ms | 3.912 ms | 8.422 ms | 195.52 ms | 26000.0000 | 666.6667 | 313.26 MB |
304+
| *TiktokenSharp* | .NET Framework 4.7.1 | .NET Framework 4.7.1 | 488.22 ms | 9.696 ms | 15.931 ms | 487.17 ms | 63000.0000 | 1000.0000 | 378.31 MB |
284305

285306
## Performance
286307

SharpToken.Benchmark/CompareBenchmark.cs

+17-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using BenchmarkDotNet.Jobs;
44
using TiktokenSharp;
55
using Microsoft.DeepDev;
6-
6+
using Microsoft.ML.Tokenizers;
77

88
namespace SharpToken.Benchmark
99
{
@@ -17,6 +17,7 @@ public class CompareBenchmark
1717
private GptEncoding _sharpToken;
1818
private TikToken _tikToken;
1919
private ITokenizer _tokenizer;
20+
private Tokenizer _mlTokenizer;
2021
private string _kLongText;
2122

2223
[GlobalSetup]
@@ -25,6 +26,7 @@ public async Task Setup()
2526
_sharpToken = GptEncoding.GetEncoding("cl100k_base");
2627
_tikToken = await TikToken.GetEncodingAsync("cl100k_base").ConfigureAwait(false);
2728
_tokenizer = await TokenizerBuilder.CreateByModelNameAsync("gpt-4").ConfigureAwait(false);
29+
_mlTokenizer = Tokenizer.CreateTiktokenForModel("gpt-4");
2830
_kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
2931
}
3032

@@ -69,5 +71,19 @@ public int TokenizerLib()
6971

7072
return sum;
7173
}
74+
75+
[Benchmark]
76+
public int MLTokenizers()
77+
{
78+
var sum = 0;
79+
for (var i = 0; i < 10000; i++)
80+
{
81+
var encoded = _mlTokenizer.EncodeToIds(_kLongText);
82+
var decoded = _mlTokenizer.Decode(encoded);
83+
sum += decoded.Length;
84+
}
85+
86+
return sum;
87+
}
7288
}
7389
}

SharpToken.Benchmark/SharpToken.Benchmark.csproj

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
<PackageReference Include="Microsoft.DeepDev.TokenizerLib" Version="1.3.3" />
1212
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
1313
<PackageReference Include="TiktokenSharp" Version="1.0.9" />
14+
<PackageReference Include="Microsoft.ML.Tokenizers" Version="0.22.0-preview.24179.1" />
1415
</ItemGroup>
1516

1617
<ItemGroup>

0 commit comments

Comments
 (0)