1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
//
// RobertaTokenizerFast.swift
// iTexSnip
//
// Created by Navan Chauhan on 10/13/24.
//
import Foundation
/// A lightweight RoBERTa-style tokenizer backed by JSON resources in the main bundle.
///
/// This is a simplified approximation rather than a full byte-level BPE
/// implementation: text is split on spaces and each word is matched against the
/// vocabulary either whole, as adjacent two-character pairs, or as single characters.
class RobertaTokenizerFast {
    /// Maps a token string to its integer id.
    var vocab: [String: Int] = [:]
    /// Inverse of `vocab`, built once in `init`.
    var idToToken: [Int: String] = [:]
    /// Tokens that `decode` removes when `skipSpecialTokens` is true.
    var specialTokens: [String] = []
    /// Id of the `<unk>` token, or `nil` when the vocabulary does not define one.
    var unkTokenId: Int?

    /// Loads the vocabulary and tokenizer configuration from the main bundle.
    ///
    /// Loading is best-effort: a missing or malformed resource leaves the
    /// corresponding property at its empty default instead of failing.
    ///
    /// - Parameters:
    ///   - vocabFile: Resource name (without the `.json` extension) of a JSON
    ///     object mapping token strings to integer ids.
    ///   - tokenizerFile: Resource name (without the `.json` extension) of a JSON
    ///     object whose `added_tokens` entry lists the special tokens.
    init(vocabFile: String, tokenizerFile: String) {
        if let vocabURL = Bundle.main.url(forResource: vocabFile, withExtension: "json"),
           let vocabData = try? Data(contentsOf: vocabURL),
           let vocabDict = try? JSONSerialization.jsonObject(with: vocabData, options: []) as? [String: Int] {
            self.vocab = vocabDict
        }
        if let tokenizerURL = Bundle.main.url(forResource: tokenizerFile, withExtension: "json"),
           let tokenizerData = try? Data(contentsOf: tokenizerURL),
           let tokenizerConfig = try? JSONSerialization.jsonObject(with: tokenizerData, options: []) as? [String: Any] {
            self.specialTokens = RobertaTokenizerFast.addedTokenStrings(from: tokenizerConfig["added_tokens"])
        }
        self.idToToken = vocab.reduce(into: [Int: String]()) { $0[$1.value] = $1.key }
        self.unkTokenId = vocab["<unk>"]
    }

    /// Converts `text` into vocabulary ids.
    ///
    /// Tokens missing from the vocabulary map to the `<unk>` id. When no `<unk>`
    /// id is available they are dropped, so the result may be shorter than the
    /// token sequence (previously this case crashed on a force unwrap).
    ///
    /// - Parameter text: The text to encode.
    /// - Returns: The ids of the produced tokens, in order.
    func encode(text: String) -> [Int] {
        // `vocab` is mutable, so also consult it in case `<unk>` was added after init.
        let fallbackId = unkTokenId ?? vocab["<unk>"]
        return tokenize(text).compactMap { vocab[$0] ?? fallbackId }
    }

    /// Converts ids back into text.
    ///
    /// - Parameters:
    ///   - tokenIds: Ids to decode; ids absent from `idToToken` are ignored.
    ///   - skipSpecialTokens: When true, tokens listed in `specialTokens` and the
    ///     literal `</s>` are removed before the text is assembled.
    /// - Returns: The reconstructed string.
    func decode(tokenIds: [Int], skipSpecialTokens: Bool = true) -> String {
        let tokens = tokenIds.compactMap { idToToken[$0] }
        guard skipSpecialTokens else {
            return convertTokensToString(tokens)
        }
        // A set gives constant-time membership checks instead of scanning the array per token.
        let special = Set(specialTokens)
        let keptTokens = tokens.filter { !special.contains($0) && $0 != "</s>" }
        return convertTokensToString(keptTokens)
    }

    /// Extracts token strings from the `added_tokens` value of a tokenizer config.
    ///
    /// Accepts both a plain list of strings and the Hugging Face `tokenizer.json`
    /// layout, where each entry is an object carrying the token text under
    /// `content`. Every added token is returned, whatever other fields it has.
    private static func addedTokenStrings(from rawValue: Any?) -> [String] {
        guard let entries = rawValue as? [Any] else { return [] }
        return entries.compactMap { entry -> String? in
            if let token = entry as? String {
                return token
            }
            return (entry as? [String: Any])?["content"] as? String
        }
    }

    /// Trims `text`, splits it on spaces, and breaks each word into vocabulary tokens.
    private func tokenize(_ text: String) -> [String] {
        let words = cleanText(text).split(separator: " ").map { String($0) }
        return words.flatMap { bpeEncode($0) }
    }

    /// Breaks a single word into tokens.
    ///
    /// A word found in the vocabulary is returned as one token. Otherwise the word
    /// is scanned left to right: an adjacent two-character pair found in the
    /// vocabulary becomes one token, and any other character becomes its own
    /// token (whether or not it is in the vocabulary).
    private func bpeEncode(_ word: String) -> [String] {
        if vocab[word] != nil {
            return [word]
        }
        let chars = Array(word)
        var tokens: [String] = []
        var i = 0
        while i < chars.count {
            if i < chars.count - 1 {
                let pair = String(chars[i]) + String(chars[i + 1])
                if vocab[pair] != nil {
                    tokens.append(pair)
                    i += 2
                    continue
                }
            }
            tokens.append(String(chars[i]))
            i += 1
        }
        return tokens
    }

    /// Strips leading and trailing whitespace and newlines.
    private func cleanText(_ text: String) -> String {
        return text.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Joins tokens into display text.
    ///
    /// Replaces the `Ġ` marker with a space, removes a whitespace character that
    /// directly precedes a punctuation mark (`? . ! , ' "`) followed by whitespace
    /// or the end of the string, and trims surrounding whitespace.
    private func convertTokensToString(_ tokens: [String]) -> String {
        let text = tokens.joined().replacingOccurrences(of: "Ġ", with: " ")
        return text.replacingOccurrences(of: "\\s([?.!,\'\"](?:\\s|$))", with: "$1", options: .regularExpression, range: nil).trimmingCharacters(in: .whitespaces)
    }
}
|