Initialize tokenizer outside the isolate

This commit is contained in:
vishnukvmd 2023-12-13 14:38:24 +05:30
parent f0f4f7f429
commit b2f9dd2c8b

View file

@ -1,12 +1,11 @@
import "dart:convert";
import "dart:math";
import "package:flutter/services.dart";
import "package:html_unescape/html_unescape.dart";
import "package:tuple/tuple.dart";
class OnnxTextTokenizer {
final String bpePath;
late String vocabulary;
late Map<int, String> byteEncoder;
late Map<String, int> byteDecoder;
late Map<int, String> decoder;
@ -27,15 +26,15 @@ class OnnxTextTokenizer {
late int sot;
late int eot;
OnnxTextTokenizer(this.bpePath);
OnnxTextTokenizer();
// Async method because loadFile returns a Future and Dart constructors cannot be async.
Future init() async {
final bpe = await loadFile();
Future<void> init(String vocabulary) async {
this.vocabulary = vocabulary;
byteEncoder = bytesToUnicode();
byteDecoder = byteEncoder.map((k, v) => MapEntry(v, k));
var split = bpe.split('\n');
var split = vocabulary.split('\n');
split = split.sublist(1, 49152 - 256 - 2 + 1);
final merges = split
.map((merge) => Tuple2(merge.split(' ')[0], merge.split(' ')[1]))
@ -61,10 +60,6 @@ class OnnxTextTokenizer {
eot = encoder['<|endoftext|>']!;
}
/// Loads the asset located at [bpePath] through [rootBundle] and completes
/// with its contents as a string.
Future<String> loadFile() async {
// Await the bundle read into a named local before handing it back.
final contents = await rootBundle.loadString(bpePath);
return contents;
}
List<int> encode(String text) {
final List<int> bpeTokens = [];
text = whitespaceClean(basicClean(text)).toLowerCase();