Loading...
Searching...
No Matches
unicodeUtils.h
Go to the documentation of this file.
1//
2// Copyright 2023 Pixar
3//
4// Licensed under the Apache License, Version 2.0 (the "Apache License")
5// with the following modification; you may not use this file except in
6// compliance with the Apache License and the following modification to it:
7// Section 6. Trademarks. is deleted and replaced with:
8//
9// 6. Trademarks. This License does not grant permission to use the trade
10// names, trademarks, service marks, or product names of the Licensor
11// and its affiliates, except as required to comply with Section 4(c) of
12// the License and to reproduce the content of the NOTICE file.
13//
14// You may obtain a copy of the Apache License at
15//
16// http://www.apache.org/licenses/LICENSE-2.0
17//
18// Unless required by applicable law or agreed to in writing, software
19// distributed under the Apache License with the above modification is
20// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
21// KIND, either express or implied. See the Apache License for the specific
22// language governing permissions and limitations under the Apache License.
23//
24#ifndef PXR_BASE_TF_UNICODE_UTILS_H
25#define PXR_BASE_TF_UNICODE_UTILS_H
26
30
31#include "pxr/pxr.h"
32#include "pxr/base/tf/api.h"
34
35#include <ostream>
36#include <string>
37#include <string_view>
38
39PXR_NAMESPACE_OPEN_SCOPE
40
54public:
/// Code points that cannot be decoded or are outside of the valid range
/// are replaced with this value (U+FFFD).
57 static constexpr uint32_t ReplacementValue = 0xFFFD;
58
/// Values higher than this are replaced with the replacement code point.
61 static constexpr uint32_t MaximumValue = 0x10FFFF;
62
/// Values in this range (bounds inclusive) cannot be constructed and are
/// replaced by the replacement code point.
65 static constexpr std::pair<uint32_t, uint32_t>
66 SurrogateRange = {0xD800, 0xDFFF};
67
/// Construct a code point initialized to the replacement value.
69 constexpr TfUtf8CodePoint() = default;
70
/// Construct a code point from \a value. A value greater than
/// MaximumValue, or one that falls inside SurrogateRange (bounds
/// inclusive), is stored as ReplacementValue instead of \a value.
73 constexpr explicit TfUtf8CodePoint(uint32_t value) :
74 _value(((value <= MaximumValue) &&
75 ((value < SurrogateRange.first) ||
76 (value > SurrogateRange.second))) ?
77 value : ReplacementValue) {}
78
/// Returns the stored code point value as a 32-bit unsigned integer.
79 constexpr uint32_t AsUInt32() const { return _value; }
80
/// Two code points compare equal when their stored values are equal.
81 friend constexpr bool operator==(const TfUtf8CodePoint left,
82 const TfUtf8CodePoint right) {
83 return left._value == right._value;
84 }
/// Two code points compare unequal when their stored values differ.
85 friend constexpr bool operator!=(const TfUtf8CodePoint left,
86 const TfUtf8CodePoint right) {
87 return left._value != right._value;
88 }
89
90private:
91 uint32_t _value{ReplacementValue};
92};
93
/// Stream insertion operator for a TfUtf8CodePoint; declared here and
/// defined out of line.
/// NOTE(review): the output format is not visible in this header --
/// presumably the UTF-8 encoding of the code point; confirm against the
/// implementation.
94TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);
95
100
102constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
103{
104 return static_cast<unsigned char>(value) < 128 ?
105 TfUtf8CodePoint(static_cast<unsigned char>(value)) :
107}
108
117public:
118 using iterator_category = std::forward_iterator_tag;
120 using difference_type = std::ptrdiff_t;
121 using pointer = void;
123
/// Empty tag type used as an end marker: it models iteration ending when
/// the underlying iterator's end condition has been met. It carries no
/// state of its own.
126 class PastTheEndSentinel final {};
127
136 const std::string_view::const_iterator& it,
137 const std::string_view::const_iterator& end) : _it(it), _end(end) {
138 TF_DEV_AXIOM(_it <= _end);
139 }
140
148 {
149 return TfUtf8CodePoint{_GetCodePoint()};
150 }
151
/// Retrieves the wrapped string iterator, i.e. the current byte position
/// within the underlying string_view.
153 std::string_view::const_iterator GetBase() const
154 {
155 return this->_it;
156 }
157
163 {
164 return (this->_it == rhs._it);
165 }
166
/// Determines if two iterators are unequal. Only the current positions
/// are compared; the stored end positions do not take part.
171 bool operator!= (const TfUtf8CodePointIterator& rhs) const
172 {
173 return (this->_it != rhs._it);
174 }
175
182 {
183 // The increment operator should never be called if it's past
184 // the end. The user is expected to have already checked this
185 // condition.
186 TF_DEV_AXIOM(!_IsPastTheEnd());
187 _EncodingLength increment = _GetEncodingLength();
188 // Note that in cases where the encoding is invalid, we move to the
189 // next byte. This is necessary because otherwise the iterator would
190 // never advance and the end condition of == iterator::end() would
191 // never be satisfied. This means that we increment, even if the
192 // encoding length is 0.
193 ++_it;
194 // Only continuation bytes will be consumed after the first byte.
195 // This avoids consumption of ASCII characters or other starting bytes.
196 auto isContinuation = [](const char c) {
197 const auto uc = static_cast<unsigned char>(c);
198 return (uc >= static_cast<unsigned char>('\x80')) &&
199 (uc < static_cast<unsigned char>('\xc0'));
200 };
201 while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
202 ++_it;
203 --increment;
204 }
205 return *this;
206 }
207
214 {
215 auto temp = *this;
216 ++(*this);
217 return temp;
218 }
219
222 friend bool operator==(const TfUtf8CodePointIterator& lhs,
224 {
225 return lhs._IsPastTheEnd();
226 }
227
/// Sentinel-on-the-left form of the end check. Swaps the operands and
/// forwards to the (iterator, sentinel) comparison.
228 friend bool operator==(PastTheEndSentinel lhs,
229 const TfUtf8CodePointIterator& rhs)
230 {
231 return rhs == lhs;
232 }
233
/// Negation of the (iterator, sentinel) equality comparison.
234 friend bool operator!=(const TfUtf8CodePointIterator& lhs,
235 PastTheEndSentinel rhs)
236 {
237 return !(lhs == rhs);
238 }
/// Negation of the (sentinel, iterator) equality comparison.
239 friend bool operator!=(PastTheEndSentinel lhs,
240 const TfUtf8CodePointIterator& rhs)
241 {
242 return !(lhs == rhs);
243 }
244
245private:
246 using _EncodingLength = unsigned char;
247
248 // Retrieves the variable encoding length of the UTF-8 character
249 // currently pointed to by the iterator. This can be 1, 2, 3, or 4
250 // depending on the encoding of the UTF-8 character. If the encoding
251 // cannot be determined, this method will return 0.
// Only the byte at the current position is inspected. A byte in the
// range 0x80-0xBF or a byte of 0xF8 and above matches none of the lead
// byte patterns below and yields 0, as does an iterator that is already
// at or past the end.
252 _EncodingLength _GetEncodingLength() const
253 {
254 // already at the end, no valid character sequence
255 if (_IsPastTheEnd())
256 {
257 return 0;
258 }
259 // determine what encoding length the character is
260 // 1-byte characters have a leading 0 sequence
261 // 2-byte characters have a leading 110 sequence
262 // 3-byte characters have a leading 1110 sequence
263 // 4-byte characters have a leading 11110 sequence
// The cast to unsigned char keeps the range comparisons below correct
// when plain char is signed.
264 unsigned char x = static_cast<unsigned char>(*_it);
265 if (x < 0x80)
266 {
267 return 1;
268 }
269 else if ((x >= 0xc0) && (x < 0xe0))
270 {
271 return 2;
272 }
273 else if ((x >= 0xe0) && (x < 0xf0))
274 {
275 return 3;
276 }
277 else if ((x >= 0xf0) && (x < 0xf8))
278 {
279 return 4;
280 }
281 else
282 {
283 // can't determine encoding, this is an error
284 return 0;
285 }
286 }
287
288 // Retrieves the Unicode code point of the UTF-8 encoded character at the
289 // iterator's current position and returns it as a 32-bit value. The
290 // dereference operator wraps this result in a TfUtf8CodePoint.
291 // NOTE(review): how an invalid or truncated sequence is reported is not
292 // visible in this header -- presumably the replacement value; confirm.
293 TF_API uint32_t _GetCodePoint() const;
294
295 // Returns true if the iterator is at or past the end and can no longer
296 // be dereferenced.
297 bool _IsPastTheEnd() const
298 {
299 return _it >= _end;
300 }
301
302 std::string_view::const_iterator _it;
303 std::string_view::const_iterator _end;
304};
305
339public:
341
/// Constructs a view over a default-constructed (empty) string_view.
342 TfUtf8CodePointView() = default;
/// Wraps \a view so that it can be iterated as code points. Only the
/// string_view is stored; std::string_view does not own its characters,
/// so the referenced character data must outlive this object.
343 explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}
344
/// Returns an iterator positioned at the first byte of the underlying
/// view and bounded by the view's end.
345 inline const_iterator begin() const
346 {
347 return const_iterator{std::cbegin(_view), std::cend(_view)};
348 }
349
353 {
355 }
356
/// Same as begin(); provided for parity with standard container naming.
357 inline const_iterator cbegin() const
358 {
359 return begin();
360 }
361
365 {
366 return end();
367 }
368
/// Returns true if the underlying view is empty.
370 bool empty() const
371 {
372 return _view.empty();
373 }
374
386 {
387 return const_iterator(std::cend(_view), std::cend(_view));
388 }
389
390private:
391 std::string_view _view;
392};
393
/// Determines whether the given Unicode \a codePoint is in the XID_Start
/// character class. Declared here; the definition lives out of line.
403TF_API
404bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
405
/// Convenience overload taking a TfUtf8CodePoint; forwards its 32-bit
/// value to the uint32_t overload of TfIsUtf8CodePointXidStart.
410inline bool TfIsUtf8CodePointXidStart(const TfUtf8CodePoint codePoint)
411{
412 return TfIsUtf8CodePointXidStart(codePoint.AsUInt32());
413}
414
/// Determines whether the given Unicode \a codePoint is in the
/// XID_Continue character class. Declared here; the definition lives out
/// of line.
424TF_API
425bool TfIsUtf8CodePointXidContinue(uint32_t codePoint);
426
432{
433 return TfIsUtf8CodePointXidContinue(codePoint.AsUInt32());
434}
435
436PXR_NAMESPACE_CLOSE_SCOPE
437
438#endif // PXR_BASE_TF_UNICODE_UTILS_H
Low-level utilities for informing users of various internal and external diagnostic conditions.
Wrapper for a 32-bit code point value that can be encoded as UTF-8.
Definition: unicodeUtils.h:53
static constexpr std::pair< uint32_t, uint32_t > SurrogateRange
Values in this range (inclusive) cannot be constructed and will be replaced by the replacement code p...
Definition: unicodeUtils.h:66
constexpr TfUtf8CodePoint(uint32_t value)
Construct a UTF-8 valued code point, constrained by the maximum value and surrogate range.
Definition: unicodeUtils.h:73
static constexpr uint32_t MaximumValue
Values higher than this will be replaced with the replacement code point.
Definition: unicodeUtils.h:61
constexpr TfUtf8CodePoint()=default
Construct a code point initialized to the replacement value.
static constexpr uint32_t ReplacementValue
Code points that cannot be decoded or are outside of the valid range will be replaced with this value...
Definition: unicodeUtils.h:57
Defines an iterator over a UTF-8 encoded string that extracts unicode code point values.
Definition: unicodeUtils.h:116
bool operator!=(const TfUtf8CodePointIterator &rhs) const
Determines if two iterators are unequal.
Definition: unicodeUtils.h:171
friend bool operator==(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel)
Checks if the lhs iterator is at or past the end for the underlying string_view
Definition: unicodeUtils.h:222
value_type operator*() const
Retrieves the current UTF-8 character in the sequence as its Unicode code point value.
Definition: unicodeUtils.h:147
TfUtf8CodePointIterator & operator++()
Advances the iterator logically one UTF-8 character sequence in the string.
Definition: unicodeUtils.h:181
TfUtf8CodePointIterator(const std::string_view::const_iterator &it, const std::string_view::const_iterator &end)
Constructs an iterator that can read UTF-8 character sequences from the given starting string_view it...
Definition: unicodeUtils.h:135
std::string_view::const_iterator GetBase() const
Retrieves the wrapped string iterator.
Definition: unicodeUtils.h:153
Model iteration ending when the underlying iterator's end condition has been met.
Definition: unicodeUtils.h:126
Wrapper for a UTF-8 encoded std::string_view that can be iterated over as code points instead of byte...
Definition: unicodeUtils.h:338
bool empty() const
Returns true if the underlying view is empty.
Definition: unicodeUtils.h:370
TfUtf8CodePointIterator::PastTheEndSentinel end() const
The sentinel will compare as equal to any iterator at the end of the underlying string_view
Definition: unicodeUtils.h:352
TfUtf8CodePointIterator::PastTheEndSentinel cend() const
The sentinel will compare as equal to any iterator at the end of the underlying string_view
Definition: unicodeUtils.h:364
const_iterator EndAsIterator() const
Returns an iterator of the same type as begin that identifies the end of the string.
Definition: unicodeUtils.h:385
GF_API std::ostream & operator<<(std::ostream &, const GfBBox3d &)
Output a GfBBox3d using the format [(range) matrix zeroArea].
#define TF_DEV_AXIOM(cond)
The same as TF_AXIOM, but compiled only in dev builds.
Definition: diagnostic.h:222
constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint
The replacement code point can be used to signal that a code point could not be decoded and needed to...
Definition: unicodeUtils.h:98
TF_API bool TfIsUtf8CodePointXidContinue(uint32_t codePoint)
Determines whether the given Unicode codePoint is in the XID_Continue character class.
constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
Constructs a TfUtf8CodePoint from an ASCII character (0-127).
Definition: unicodeUtils.h:102
TF_API bool TfIsUtf8CodePointXidStart(uint32_t codePoint)
Determines whether the given Unicode codePoint is in the XID_Start character class.