| // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file |
| // for details. All rights reserved. Use of this source code is governed by a |
| // BSD-style license that can be found in the LICENSE file. |
| |
| library utf.utf8; |
| |
| import 'dart:collection'; |
| |
| import 'constants.dart'; |
| import 'list_range.dart'; |
| import 'shared.dart'; |
| |
// Largest code point that fits in a one-, two- and three-byte sequence.
const _UTF8_ONE_BYTE_MAX = 0x7f;
const _UTF8_TWO_BYTE_MAX = 0x7ff;
const _UTF8_THREE_BYTE_MAX = 0xffff;

// Selects the six payload bits carried by every trailing byte.
const _UTF8_LO_SIX_BIT_MASK = 0x3f;

// Smallest lead byte announcing a sequence of two to six bytes.
const _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0;
const _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0;
const _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0;
const _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8;
const _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc;

// Payload bits kept in the lead byte of a two-, three- and four-byte sequence.
const _UTF8_FIRST_BYTE_OF_TWO_MASK = 0x1f;
const _UTF8_FIRST_BYTE_OF_THREE_MASK = 0xf;
const _UTF8_FIRST_BYTE_OF_FOUR_MASK = 0x7;

// Lead bytes must be strictly below this value; 0xfe and 0xff are rejected.
const _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe;
// Marker bits (binary 10xxxxxx) OR-ed into every trailing byte.
const _UTF8_SUBSEQUENT_BYTE_BASE = 0x80;
| |
/// Returns a lazy [Iterable] over the code points encoded by the UTF-8 [bytes].
///
/// Bytes are decoded only as the returned iterable is consumed, so a caller
/// that stops early does not decode the rest of the input. [offset] and
/// [length] are forwarded to [IterableUtf8Decoder] to select the part of
/// [bytes] to decode. Malformed input is reported as [replacementCodepoint];
/// pass `null` to have an [ArgumentError] thrown instead.
IterableUtf8Decoder decodeUtf8AsIterable(List<int> bytes,
        [int offset = 0,
        int length,
        int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) =>
    IterableUtf8Decoder(bytes, offset, length, replacementCodepoint);
| |
/// Decodes a list of UTF-8 encoded [bytes] into a [String].
///
/// [offset] and [length] are forwarded to [Utf8Decoder] to select the part of
/// [bytes] to decode. Malformed input is replaced by [replacementCodepoint],
/// which defaults to the Unicode replacement character; pass `null` to have
/// an [ArgumentError] thrown instead.
String decodeUtf8(List<int> bytes,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final decoder = Utf8Decoder(bytes, offset, length, replacementCodepoint);
  final codepoints = decoder.decodeRest();
  return String.fromCharCodes(codepoints);
}
| |
/// Encodes [str] as a list of UTF-8 bytes.
///
/// The string is first turned into code points with [stringToCodepoints] and
/// those are then encoded by [codepointsToUtf8].
List<int> encodeUtf8(String str) {
  final codepoints = stringToCodepoints(str);
  return codepointsToUtf8(codepoints);
}
| |
// Writes the [bytes] trailing bytes of a multi-byte sequence into [buffer].
//
// The slots `offset + 1` through `offset + bytes` are filled from last to
// first, each receiving the next six low-order bits of [value] tagged with the
// trailing-byte marker. The slot at [offset] itself is left for the caller,
// which builds the lead byte from the returned leftover high-order bits.
int _addToEncoding(int offset, int bytes, int value, List<int> buffer) {
  for (var slot = offset + bytes; slot > offset; slot--) {
    final payload = value & _UTF8_LO_SIX_BIT_MASK;
    buffer[slot] = _UTF8_SUBSEQUENT_BYTE_BASE | payload;
    value >>= 6;
  }
  return value;
}
| |
/// Encodes [codepoints] as a list of UTF-8 code units.
///
/// [offset] and [length] are handed to [ListRange] to select the part of
/// [codepoints] to encode. A value that is negative or greater than
/// [UNICODE_VALID_RANGE_MAX] is emitted as the bytes `EF BF BD`, which is the
/// UTF-8 encoding of U+FFFD.
List<int> codepointsToUtf8(List<int> codepoints, [int offset = 0, int length]) {
  final range = ListRange(codepoints, offset, length);

  // Number of bytes a single code point occupies in the output.
  int widthOf(int codepoint) {
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) return 3;
    if (codepoint <= _UTF8_ONE_BYTE_MAX) return 1;
    if (codepoint <= _UTF8_TWO_BYTE_MAX) return 2;
    if (codepoint <= _UTF8_THREE_BYTE_MAX) return 3;
    return 4;
  }

  // First pass: size the output so it can be allocated exactly once.
  var total = 0;
  for (final codepoint in range) {
    total += widthOf(codepoint);
  }

  // Second pass: write each code point at its final position. Every slot is
  // overwritten below, so the fill value never reaches the caller.
  final out = List<int>.filled(total, 0);
  var cursor = 0;
  for (final codepoint in range) {
    final width = widthOf(codepoint);
    if (codepoint < 0 || codepoint > UNICODE_VALID_RANGE_MAX) {
      out.setRange(cursor, cursor + width, const [0xef, 0xbf, 0xbd]);
    } else if (width == 1) {
      out[cursor] = codepoint;
    } else {
      int base;
      int mask;
      switch (width) {
        case 2:
          base = _UTF8_FIRST_BYTE_OF_TWO_BASE;
          mask = _UTF8_FIRST_BYTE_OF_TWO_MASK;
          break;
        case 3:
          base = _UTF8_FIRST_BYTE_OF_THREE_BASE;
          mask = _UTF8_FIRST_BYTE_OF_THREE_MASK;
          break;
        default:
          base = _UTF8_FIRST_BYTE_OF_FOUR_BASE;
          mask = _UTF8_FIRST_BYTE_OF_FOUR_MASK;
      }
      // The trailing bytes are written first; what is left of the code point
      // afterwards goes into the lead byte.
      final highBits = _addToEncoding(cursor, width - 1, codepoint, out);
      out[cursor] = base | (mask & highBits);
    }
    cursor += width;
  }
  return out;
}
| |
/// Decodes UTF-8 [utf8EncodedBytes] into a list of Unicode code points.
///
/// UTF-8 specifies its own byte order, so this function does not follow the
/// byte-order pattern used for UTF-16 and UTF-32. [offset] and [length] are
/// forwarded to [Utf8Decoder], as is [replacementCodepoint], the value
/// substituted for malformed input.
List<int> utf8ToCodepoints(List<int> utf8EncodedBytes,
    [int offset = 0,
    int length,
    int replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT]) {
  final decoder =
      Utf8Decoder(utf8EncodedBytes, offset, length, replacementCodepoint);
  final codepoints = decoder.decodeRest();
  return codepoints;
}
| |
/// Return type of [decodeUtf8AsIterable] and variants.
///
/// Every call to [iterator] creates a fresh [Utf8Decoder] over the same bytes,
/// so bytes are translated only as the consumer advances that iterator, and
/// results are not cached between iterations.
// TODO(floitsch): Consider removing the extend and switch to implements since
// that's cheaper to allocate.
class IterableUtf8Decoder extends IterableBase<int> {
  /// The list holding the UTF-8 encoded bytes.
  final List<int> bytes;

  /// The offset into [bytes] that is handed to each [Utf8Decoder].
  final int offset;

  // Limit on how much of [bytes] is decoded; may be null when the caller gave
  // none. Deliberately private: exposing it as a public `length` field would
  // override [Iterable.length], which must report the number of elements this
  // iterable yields (decoded code points), not the size of the input window.
  final int _length;

  /// The code point substituted for malformed input, or `null` to have the
  /// decoder throw an [ArgumentError] instead.
  final int replacementCodepoint;

  /// Creates an iterable over [bytes].
  ///
  /// [offset], [length] and [replacementCodepoint] are passed unchanged to
  /// every [Utf8Decoder] created by [iterator].
  IterableUtf8Decoder(this.bytes,
      [this.offset = 0,
      int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : _length = length;

  @override
  Utf8Decoder get iterator =>
      Utf8Decoder(bytes, offset, _length, replacementCodepoint);
}
| |
/// An [Iterator] that yields Unicode code points decoded from UTF-8 bytes.
///
/// The optional constructor arguments set an offset into the list of bytes,
/// limit the length of the values to be decoded, and override the default
/// Unicode replacement character. Pass `null` as [replacementCodepoint] to
/// have an [ArgumentError] thrown rather than the bad value replaced.
class Utf8Decoder implements Iterator<int> {
  // TODO(kevmoo): should this field be private?
  final ListRangeIterator utf8EncodedBytesIterator;
  final int replacementCodepoint;
  int _current;

  Utf8Decoder(List<int> utf8EncodedBytes,
      [int offset = 0,
      int length,
      this.replacementCodepoint = UNICODE_REPLACEMENT_CHARACTER_CODEPOINT])
      : utf8EncodedBytesIterator =
            ListRange(utf8EncodedBytes, offset, length).iterator;

  /// Decodes everything this decoder has not yet consumed into a list.
  List<int> decodeRest() {
    // One slot per remaining byte is an upper bound, since every decoded code
    // point consumes at least one byte.
    final buffer = List<int>.filled(utf8EncodedBytesIterator.remaining, 0);
    var count = 0;
    while (moveNext()) {
      buffer[count] = current;
      count++;
    }
    if (count == buffer.length) return buffer;

    // Fewer code points than bytes: hand back a list of the exact size.
    return List<int>.filled(count, 0)..setRange(0, count, buffer);
  }

  @override
  int get current => _current;

  @override
  bool moveNext() {
    _current = null;

    final source = utf8EncodedBytesIterator;
    if (!source.moveNext()) return false;

    final lead = source.current;

    // Values that can never start a sequence: negatives, a trailing byte with
    // no lead byte in front of it, and anything at or above the lead bound.
    final malformedLead = lead < 0 ||
        (lead > _UTF8_ONE_BYTE_MAX && lead < _UTF8_FIRST_BYTE_OF_TWO_BASE) ||
        lead >= _UTF8_FIRST_BYTE_BOUND_EXCL;
    if (malformedLead) {
      if (replacementCodepoint == null) {
        throw ArgumentError(
            'Invalid UTF8 at ${utf8EncodedBytesIterator.position}');
      }
      _current = replacementCodepoint;
      return true;
    }

    // A single-byte value is its own code point.
    if (lead <= _UTF8_ONE_BYTE_MAX) {
      _current = lead;
      return true;
    }

    // Multi-byte sequence: strip the length marker from the lead byte and
    // note how many trailing bytes are expected.
    int codepoint;
    int trailing;
    if (lead < _UTF8_FIRST_BYTE_OF_THREE_BASE) {
      codepoint = lead - _UTF8_FIRST_BYTE_OF_TWO_BASE;
      trailing = 1;
    } else if (lead < _UTF8_FIRST_BYTE_OF_FOUR_BASE) {
      codepoint = lead - _UTF8_FIRST_BYTE_OF_THREE_BASE;
      trailing = 2;
    } else if (lead < _UTF8_FIRST_BYTE_OF_FIVE_BASE) {
      codepoint = lead - _UTF8_FIRST_BYTE_OF_FOUR_BASE;
      trailing = 3;
    } else if (lead < _UTF8_FIRST_BYTE_OF_SIX_BASE) {
      codepoint = lead - _UTF8_FIRST_BYTE_OF_FIVE_BASE;
      trailing = 4;
    } else {
      codepoint = lead - _UTF8_FIRST_BYTE_OF_SIX_BASE;
      trailing = 5;
    }

    // Fold in trailing bytes until the sequence is complete or breaks off.
    var consumed = 0;
    while (consumed < trailing && source.moveNext()) {
      final next = source.current;
      final isTrailingByte =
          next > _UTF8_ONE_BYTE_MAX && next < _UTF8_FIRST_BYTE_OF_TWO_BASE;
      if (!isTrailingByte) {
        // Only a sequence-starting byte is handed back so that decoding
        // resumes there; any other offending value stays consumed.
        if (next >= _UTF8_FIRST_BYTE_OF_TWO_BASE) {
          source.backup();
        }
        break;
      }
      codepoint = (codepoint << 6) | (next & _UTF8_LO_SIX_BIT_MASK);
      consumed++;
    }

    // The shortest-form check only ever passes for two- to four-byte
    // sequences, so five- and six-byte sequences are always rejected.
    bool shortestForm;
    switch (trailing) {
      case 1:
        shortestForm = codepoint > _UTF8_ONE_BYTE_MAX;
        break;
      case 2:
        shortestForm = codepoint > _UTF8_TWO_BYTE_MAX;
        break;
      case 3:
        shortestForm = codepoint > _UTF8_THREE_BYTE_MAX;
        break;
      default:
        shortestForm = false;
    }
    final complete = consumed == trailing;
    final reserved = codepoint >= UNICODE_UTF16_RESERVED_LO &&
        codepoint <= UNICODE_UTF16_RESERVED_HI;
    final tooLarge = codepoint > UNICODE_VALID_RANGE_MAX;

    if (complete && !reserved && shortestForm && !tooLarge) {
      _current = codepoint;
      return true;
    }
    if (replacementCodepoint == null) {
      throw ArgumentError(
          'Invalid UTF8 at ${utf8EncodedBytesIterator.position - consumed}');
    }
    _current = replacementCodepoint;
    return true;
  }
}