Skip to content

Commit a8188f8

Browse files
authored
[ML] Fix Array out of bounds exception in the XLM Roberta tokenizer (#106655)
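Increases the buffer size for the normalised form of the input Unicode character. Certain characters can have surprisingly long normalised forms; for example, the single character "ﷺ" normalises to the 18-character phrase "صلى الله عليه وسلم", so the reusable decode buffer is enlarged from 8 to 64 chars.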
1 parent d8fc877 commit a8188f8

File tree

3 files changed

+11
-4
lines changed

3 files changed

+11
-4
lines changed

docs/changelog/106655.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 106655
2+
summary: Fix Array out of bounds exception in the XLM Roberta tokenizer
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,8 @@ static Config fromBase64EncodedResource(String resourcePath) throws IOException
7373
private final int[] offsets;
7474
// The entire normalized bytes representations delimited by NULL
7575
private final byte[] normalizedStrUtf8Bytes;
76-
// Continually reused to copy a single char into utf8 bytes
77-
private final byte[] reusableCharByteBuffer = new byte[4];
7876
// reusable char buffer for decoding utf8 bytes to determine char offset corrections
79-
private final char[] reusableCharDecodeBuffer = new char[8];
77+
private final char[] reusableCharDecodeBuffer = new char[64];
8078
private Reader transformedInput;
8179

8280
public PrecompiledCharMapNormalizer(int[] offsets, String normalizedStr, Reader in) {
@@ -172,7 +170,6 @@ Reader normalize(CharSequence str) {
172170
ByteBuffer byteBuffer = StandardCharsets.UTF_8.encode(CharBuffer.wrap(str));
173171
byte[] strBytes = new byte[byteBuffer.limit()];
174172
byteBuffer.get(strBytes);
175-
int[] strCp = str.codePoints().toArray();
176173
BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
177174
b.setText(str);
178175
// We iterate the whole string, so b.first() is always `0`

x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizerTests.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ public void testEmoji() throws IOException {
5757
assertNormalization("😀", parsed, "😀");
5858
}
5959

60+
public void testCharThatNormalizesToLongText() throws IOException {
61+
PrecompiledCharMapNormalizer.Config parsed = loadTestCharMap();
62+
assertNormalization("ﷺ", parsed, "صلى الله عليه وسلم");
63+
}
64+
6065
private void assertNormalization(String input, PrecompiledCharMapNormalizer.Config config, String expected) throws IOException {
6166
PrecompiledCharMapNormalizer normalizer = new PrecompiledCharMapNormalizer(
6267
config.offsets(),

0 commit comments

Comments
 (0)