Skip to content

Commit bd78c6e

Browse files
richardlau authored and BethGriggs committed on Jun 2, 2020
deps: backport ICU-20958 to fix CVE-2020-10531
Add floating patch for ICU 64.2 from unicode-org/icu@18b212f. Original commit message: ICU-21032 Backport to 64.x: ICU-20958 Prevent SEGV_MAPERR in append See #971 (cherry picked from commit b7d08bc04a4296982fcef8b6b8a354a9e4e7afca) Refs: https://unicode-org.atlassian.net/browse/ICU-20958 Refs: unicode-org/icu#1155 CVE-ID: CVE-2020-10531 PR-URL: #33572 Reviewed-By: Beth Griggs <Bethany.Griggs@uk.ibm.com> Reviewed-By: Steven R Loomis <srloomis@us.ibm.com>
1 parent 881c244 commit bd78c6e

File tree

1 file changed

+1982
-0
lines changed

1 file changed

+1982
-0
lines changed
 
+1,982
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,1982 @@
1+
// © 2016 and later: Unicode, Inc. and others.
2+
// License & terms of use: http://www.unicode.org/copyright.html
3+
/*
4+
******************************************************************************
5+
* Copyright (C) 1999-2016, International Business Machines Corporation and
6+
* others. All Rights Reserved.
7+
******************************************************************************
8+
*
9+
* File unistr.cpp
10+
*
11+
* Modification History:
12+
*
13+
* Date Name Description
14+
* 09/25/98 stephen Creation.
15+
* 04/20/99 stephen Overhauled per 4/16 code review.
16+
* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17+
* 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18+
* Replaceable.
19+
* 06/25/01 grhoten Removed the dependency on iostream
20+
******************************************************************************
21+
*/
22+
23+
#include "unicode/utypes.h"
24+
#include "unicode/appendable.h"
25+
#include "unicode/putil.h"
26+
#include "cstring.h"
27+
#include "cmemory.h"
28+
#include "unicode/ustring.h"
29+
#include "unicode/unistr.h"
30+
#include "unicode/utf.h"
31+
#include "unicode/utf16.h"
32+
#include "uelement.h"
33+
#include "ustr_imp.h"
34+
#include "umutex.h"
35+
#include "uassert.h"
36+
37+
#if 0
38+
39+
#include <iostream>
40+
using namespace std;
41+
42+
//DEBUGGING
43+
void
44+
print(const UnicodeString& s,
45+
const char *name)
46+
{
47+
UChar c;
48+
cout << name << ":|";
49+
for(int i = 0; i < s.length(); ++i) {
50+
c = s[i];
51+
if(c>= 0x007E || c < 0x0020)
52+
cout << "[0x" << hex << s[i] << "]";
53+
else
54+
cout << (char) s[i];
55+
}
56+
cout << '|' << endl;
57+
}
58+
59+
void
60+
print(const UChar *s,
61+
int32_t len,
62+
const char *name)
63+
{
64+
UChar c;
65+
cout << name << ":|";
66+
for(int i = 0; i < len; ++i) {
67+
c = s[i];
68+
if(c>= 0x007E || c < 0x0020)
69+
cout << "[0x" << hex << s[i] << "]";
70+
else
71+
cout << (char) s[i];
72+
}
73+
cout << '|' << endl;
74+
}
75+
// END DEBUGGING
76+
#endif
77+
78+
// Local function definitions for now
79+
80+
// need to copy areas that may overlap
81+
static
82+
inline void
83+
us_arrayCopy(const UChar *src, int32_t srcStart,
84+
UChar *dst, int32_t dstStart, int32_t count)
85+
{
86+
if(count>0) {
87+
uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88+
}
89+
}
90+
91+
// u_unescapeAt() callback to get a UChar from a UnicodeString
92+
U_CDECL_BEGIN
93+
// `context` must point to an icu::UnicodeString; returns that string's
// charAt(offset).
static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset, void *context) {
  icu::UnicodeString *str = (icu::UnicodeString*) context;
  return str->charAt(offset);
}
97+
U_CDECL_END
98+
99+
U_NAMESPACE_BEGIN
100+
101+
/* The Replaceable virtual destructor can't be defined in the header
102+
due to how AIX works with multiple definitions of virtual functions.
103+
*/
104+
// Out-of-line definition of the destructor; the body is intentionally empty.
Replaceable::~Replaceable() {}
105+
106+
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107+
108+
/**
 * Concatenates s1 and s2 into a new UnicodeString.
 *
 * The result is constructed with capacity for both operands plus one extra
 * unit (room for a terminating NUL) so that the two append() calls should not
 * need to reallocate.
 *
 * @param s1 left operand, appended first
 * @param s2 right operand, appended second
 * @return the concatenation, or a bogus string if the combined length cannot
 *         be represented in an int32_t
 */
UnicodeString U_EXPORT2
operator+ (const UnicodeString &s1, const UnicodeString &s2) {
  const int32_t length1 = s1.length();
  const int32_t length2 = s2.length();
  // Signed overflow is undefined behavior, so test before adding.
  // (length() is expected to be non-negative.)
  if(length2 > INT32_MAX - length1) {
    UnicodeString bogus;
    bogus.setToBogus();
    return bogus;
  }
  int32_t capacity = length1 + length2;
  if(capacity < INT32_MAX) {
    ++capacity;  // space for a terminating NUL if one is needed
  }
  return
    UnicodeString(capacity, (UChar32)0, 0).
      append(s1).
      append(s2);
}
115+
116+
//========================================
117+
// Reference Counting functions, put at top of file so that optimizing compilers
118+
// have a chance to automatically inline.
119+
//========================================
120+
121+
void
122+
UnicodeString::addRef() {
123+
umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124+
}
125+
126+
// Atomically decrements the reference counter stored directly in front of
// fArray and returns the value reported by umtx_atomic_dec().
int32_t
UnicodeString::removeRef() {
  u_atomic_int32_t *counter = (u_atomic_int32_t *)fUnion.fFields.fArray - 1;
  return umtx_atomic_dec(counter);
}
130+
131+
// Returns the current value of the reference counter stored directly in
// front of fArray (read with umtx_loadAcquire()).
int32_t
UnicodeString::refCount() const {
  u_atomic_int32_t *counter = (u_atomic_int32_t *)fUnion.fFields.fArray - 1;
  return umtx_loadAcquire(*counter);
}
135+
136+
void
137+
UnicodeString::releaseArray() {
138+
if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139+
uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140+
}
141+
}
142+
143+
144+
145+
//========================================
146+
// Constructors
147+
//========================================
148+
149+
// The default constructor is inline in unistr.h.
150+
151+
// Constructs a string with at least `capacity` UChars of storage, filled with
// `count` copies of code point `c`.
// If count <= 0, c is not a code point (> 0x10ffff or negative), or a
// supplementary c would need more than INT32_MAX UChars, the string is only
// allocated and left without contents.
UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
  fUnion.fFields.fLengthAndFlags = 0;

  const UBool nothingToFill = count <= 0 || (uint32_t)c > 0x10ffff;
  // A supplementary code point takes two UChars; 2*count must not overflow.
  if(nothingToFill || (c > 0xffff && count > (INT32_MAX / 2))) {
    // just allocate and do not do anything else
    allocate(capacity);
    return;
  }

  const UBool isBMP = c <= 0xffff;
  const int32_t length = isBMP ? count : count * 2;
  if(!allocate(capacity < length ? length : capacity)) {
    return;
  }

  UChar *out = getArrayStart();
  UChar *const limit = out + length;
  if(isBMP) {
    const UChar unit = (UChar)c;
    while(out < limit) {
      *out++ = unit;
    }
  } else {
    // supplementary code point, write surrogate pairs
    const UChar lead = U16_LEAD(c);
    const UChar trail = U16_TRAIL(c);
    while(out < limit) {
      *out++ = lead;
      *out++ = trail;
    }
  }
  setLength(length);
}
191+
192+
// Constructs a one-unit string holding `ch` in the internal stack buffer.
UnicodeString::UnicodeString(UChar ch) {
  const int16_t oneUnitShortString = kLength1 | kShortString;
  fUnion.fFields.fLengthAndFlags = oneUnitShortString;
  fUnion.fStackFields.fBuffer[0] = ch;
}
196+
197+
// Constructs a string holding the single code point `ch`, encoded into the
// internal stack buffer with U16_APPEND. If U16_APPEND reports an error the
// string stays empty.
UnicodeString::UnicodeString(UChar32 ch) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  int32_t unitCount = 0;
  UBool failed = FALSE;
  U16_APPEND(fUnion.fStackFields.fBuffer, unitCount, US_STACKBUF_SIZE, ch, failed);
  // On failure unitCount is still 0, which matches the empty short string
  // already set up above, so there is nothing more to do.
  if(failed) {
    return;
  }
  setShortLength(unitCount);
}
208+
209+
// Constructs a string by appending `text` to an empty short string.
// The length argument -1 presumably tells doAppend() to determine the length
// of a NUL-terminated text itself (doAppend() is defined elsewhere).
UnicodeString::UnicodeString(const UChar *text) {
  const int32_t lengthUnknown = -1;
  fUnion.fFields.fLengthAndFlags = kShortString;
  doAppend(text, 0, lengthUnknown);
}
213+
214+
// Constructs a string by appending `textLength` units of `text`, starting at
// index 0, to an empty short string.
UnicodeString::UnicodeString(const UChar *text,
                             int32_t textLength) {
  const int32_t fromStart = 0;
  fUnion.fFields.fLengthAndFlags = kShortString;
  doAppend(text, fromStart, textLength);
}
219+
220+
// Read-only aliasing constructor: the new string refers to the caller's
// buffer instead of copying it.
//  - NULL text            -> empty string, no alias
//  - textLength < -1      -> bogus
//  - textLength == -1     -> requires isTerminated; length found via u_strlen
//  - textLength >= 0 with isTerminated -> text[textLength] must be 0, else bogus
// The aliased capacity includes the NUL when isTerminated is set.
UnicodeString::UnicodeString(UBool isTerminated,
                             ConstChar16Ptr textPtr,
                             int32_t textLength) {
  fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
  const UChar *text = textPtr;
  if(text == NULL) {
    // treat as an empty string, do not alias
    setToEmpty();
    return;
  }
  if(textLength < -1) {
    setToBogus();
    return;
  }
  if(textLength == -1) {
    if(!isTerminated) {
      // the length cannot be determined without a terminator
      setToBogus();
      return;
    }
    textLength = u_strlen(text);
  } else if(isTerminated && text[textLength] != 0) {
    // claimed to be terminated but is not
    setToBogus();
    return;
  }
  const int32_t aliasCapacity = isTerminated ? textLength + 1 : textLength;
  setArray(const_cast<UChar *>(text), textLength, aliasCapacity);
}
242+
243+
// Writable aliasing constructor: the new string uses the caller's buffer.
//  - NULL buff -> empty string, no alias
//  - buffLength < -1, buffCapacity < 0 or buffLength > buffCapacity -> bogus
//  - buffLength == -1 -> length is the index of the first NUL, searched only
//    within buffCapacity units
UnicodeString::UnicodeString(UChar *buff,
                             int32_t buffLength,
                             int32_t buffCapacity) {
  fUnion.fFields.fLengthAndFlags = kWritableAlias;
  if(buff == NULL) {
    // treat as an empty string, do not alias
    setToEmpty();
    return;
  }
  if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
    setToBogus();
    return;
  }
  if(buffLength == -1) {
    // like u_strlen(buff), but never look beyond buffCapacity
    int32_t scanned = 0;
    while(scanned != buffCapacity && buff[scanned] != 0) {
      ++scanned;
    }
    buffLength = scanned;
  }
  setArray(buff, buffLength, buffCapacity);
}
264+
265+
// Constructs a string from `length` chars of `src`, converted with
// u_charsToUChars(). A negative length means "use strlen(src)"; a NULL src
// yields an empty string; failure to obtain a buffer yields a bogus string.
UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  if(src==NULL) {
    // treat as an empty string
    return;
  }
  if(length<0) {
    length=(int32_t)uprv_strlen(src);
  }
  if(!cloneArrayIfNeeded(length, length, FALSE)) {
    setToBogus();
    return;
  }
  u_charsToUChars(src, getArrayStart(), length);
  setLength(length);
}
281+
282+
#if U_CHARSET_IS_UTF8
283+
284+
// Constructs a string from `codepageData` via setToUTF8() (this overload is
// only compiled when U_CHARSET_IS_UTF8). A null pointer yields an empty string.
UnicodeString::UnicodeString(const char *codepageData) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  if(codepageData == 0) {
    return;
  }
  setToUTF8(codepageData);
}
290+
291+
// Constructs a string from `dataLength` bytes of `codepageData` via
// setToUTF8() (only compiled when U_CHARSET_IS_UTF8). dataLength == -1 means
// "use strlen"; a null pointer, a zero length, or a length below -1 leaves
// the string empty.
UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  // if there's nothing to convert, do nothing
  if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
    return;
  }
  const int32_t byteCount =
      (dataLength == -1) ? (int32_t)uprv_strlen(codepageData) : dataLength;
  setToUTF8(StringPiece(codepageData, byteCount));
}
302+
303+
// else see unistr_cnv.cpp
304+
#endif
305+
306+
// Copy constructor: starts as an empty short string so that copyFrom() sees
// consistent fields, then copies `that`.
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  copyFrom(that);
}
310+
311+
// Move constructor: takes over src's fields. No prior initialization is
// needed because copyFieldsFrom() assigns fLengthAndFlags itself; passing
// TRUE lets it set a heap-backed src to bogus afterwards.
UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
  const UBool resetSource = TRUE;
  copyFieldsFrom(src, resetSource);
}
314+
315+
// Constructs a string from `that`, beginning at srcStart, by delegating to
// setTo(that, srcStart) on an initially empty short string.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart);
}
320+
321+
// Constructs a string from the range (srcStart, srcLength) of `that` by
// delegating to setTo(that, srcStart, srcLength) on an initially empty short
// string.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart, srcLength);
}
327+
328+
// Replaceable base class clone() default implementation, does not clone
329+
// Default clone() for the Replaceable base class: makes no copy and always
// returns NULL.
Replaceable *
Replaceable::clone() const {
  Replaceable *noCopy = NULL;
  return noCopy;
}
333+
334+
// UnicodeString overrides clone() with a real implementation
335+
// Returns a newly allocated copy of this string (created with the copy
// constructor).
Replaceable *
UnicodeString::clone() const {
  UnicodeString *copy = new UnicodeString(*this);
  return copy;
}
339+
340+
//========================================
341+
// array allocation
342+
//========================================
343+
344+
namespace {

// Constant part of the extra capacity added by getGrowCapacity().
const int32_t kGrowSize = 128;

// Largest capacity (in UChars) that allocate() accepts.
// One int32_t reference counter plus capacity UChars must fit into a 32-bit
// size_t (at least on 32-bit platforms). One more UChar is reserved for the
// NUL terminator, to avoid reallocation in getTerminatedBuffer(), and the
// byte count is rounded up to a multiple of 16.
// Hence capacity <= (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (More complicated checks could go up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to use for a string growing to newLength:
// newLength + newLength/4 + kGrowSize, capped at kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t headroom = kMaxCapacity - newLength;
  return (extra <= headroom) ? newLength + extra : kMaxCapacity;
}

}  // namespace
367+
368+
// Gives this string storage for at least `capacity` UChars.
//  - capacity <= US_STACKBUF_SIZE: switch to the internal stack buffer.
//  - otherwise: allocate a heap block made of an int32_t reference counter
//    (initialized to 1) followed by the UChars; fArray points just past the
//    counter.
// Returns TRUE on success. If capacity exceeds kMaxCapacity or the
// allocation fails, the string is marked bogus and FALSE is returned.
// Existing storage is not released here.
UBool
UnicodeString::allocate(int32_t capacity) {
  if(capacity <= US_STACKBUF_SIZE) {
    fUnion.fFields.fLengthAndFlags = kShortString;
    return TRUE;
  }

  int32_t *block = NULL;
  size_t numBytes = 0;
  if(capacity <= kMaxCapacity) {
    // One extra UChar for the NUL. size_t is unsigned, so up to 4GB can be
    // requested. Layout: reference counter + UChars.
    numBytes = sizeof(int32_t) + (size_t)(capacity + 1) * U_SIZEOF_UCHAR;
    // Round up to a multiple of 16.
    numBytes = (numBytes + 15) & ~15;
    block = (int32_t *) uprv_malloc(numBytes);
  }

  if(block == NULL) {
    fUnion.fFields.fLengthAndFlags = kIsBogus;
    fUnion.fFields.fArray = 0;
    fUnion.fFields.fCapacity = 0;
    return FALSE;
  }

  // set the initial refCount; the UChars start right behind it
  block[0] = 1;
  fUnion.fFields.fArray = (UChar *)(block + 1);
  // the capacity reflects the rounded-up size, minus the counter
  fUnion.fFields.fCapacity = (int32_t)((numBytes - sizeof(int32_t)) / U_SIZEOF_UCHAR);
  fUnion.fFields.fLengthAndFlags = kLongString;
  return TRUE;
}
399+
400+
//========================================
401+
// Destructor
402+
//========================================
403+
404+
#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
405+
static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
406+
static u_atomic_int32_t beyondCount(0);
407+
408+
// Diagnostic output (only built with UNISTR_COUNT_FINAL_STRING_LENGTHS):
// prints one line per final length 0..59 with its counter, then one ">59"
// line with the sum of the remaining counters and beyondCount.
U_CAPI void unistr_printLengths() {
  const int32_t lastListedLength = 59;
  int32_t index = 0;
  while(index <= lastListedLength) {
    printf("%2d, %9d\n", index, (int32_t)finalLengthCounts[index]);
    ++index;
  }
  int32_t beyond = beyondCount;
  while(index < UPRV_LENGTHOF(finalLengthCounts)) {
    beyond += finalLengthCounts[index];
    ++index;
  }
  printf(">59, %9d\n", beyond);
}
419+
#endif
420+
421+
// Destructor: releases the buffer via releaseArray(). With
// UNISTR_COUNT_FINAL_STRING_LENGTHS defined it first records the final
// length of the string for statistics.
UnicodeString::~UnicodeString()
{
#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
  // Statistics for choosing a stack buffer size: count the contents length
  // only (no NUL terminator, no spare capacity), and skip open-buffer strings
  // and strings which alias external storage.
  const int32_t notCounted = kOpenGetBuffer|kReadonlyAlias|kWritableAlias;
  if((fUnion.fFields.fLengthAndFlags&notCounted) == 0) {
    if(!hasShortLength()) {
      umtx_atomic_inc(&beyondCount);
    } else {
      umtx_atomic_inc(finalLengthCounts + getShortLength());
    }
  }
#endif

  releaseArray();
}
439+
440+
//========================================
441+
// Factory methods
442+
//========================================
443+
444+
// Factory: returns a new string whose contents were set with
// setToUTF8(utf8).
UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
  UnicodeString converted;
  converted.setToUTF8(utf8);
  return converted;
}
449+
450+
// Factory: converts `length` UTF-32 code points to a new UTF-16 string using
// u_strFromUTF32WithSub() with U+FFFD as the substitution character.
// If the first buffer is too small, the conversion is repeated with the
// length reported by the converter. Any other conversion failure produces a
// bogus string.
UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
  UnicodeString result;
  // Most UTF-32 strings are BMP-only and give a same-length UTF-16 string.
  // Overestimate slightly in case there are a few supplementary characters.
  int32_t capacity = (length <= US_STACKBUF_SIZE)
      ? US_STACKBUF_SIZE
      : length + (length >> 4) + 4;
  for(;;) {
    UChar *utf16 = result.getBuffer(capacity);
    int32_t length16;
    UErrorCode errorCode = U_ZERO_ERROR;
    u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
        utf32, length,
        0xfffd,  // Substitution character.
        NULL,    // Don't care about number of substitutions.
        &errorCode);
    result.releaseBuffer(length16);
    if(errorCode != U_BUFFER_OVERFLOW_ERROR) {
      if(U_FAILURE(errorCode)) {
        result.setToBogus();
      }
      break;
    }
    // Retry with the required size; +1 for the terminating NUL.
    capacity = length16 + 1;
  }
  return result;
}
481+
482+
//========================================
483+
// Assignment
484+
//========================================
485+
486+
// Copy assignment: delegates to copyFrom(src) and returns *this.
UnicodeString &
UnicodeString::operator=(const UnicodeString &src) {
  UnicodeString &self = copyFrom(src);
  return self;
}
490+
491+
// Like copy assignment, but calls copyFrom() with fastCopy=TRUE, which keeps
// a read-only alias as an alias instead of copying its contents.
UnicodeString &
UnicodeString::fastCopyFrom(const UnicodeString &src) {
  const UBool keepReadonlyAlias = TRUE;
  return copyFrom(src, keepReadonlyAlias);
}
495+
496+
// Makes this string a copy of src and returns *this.
//  - self-assignment: no-op
//  - bogus src: this becomes bogus
//  - empty src: this becomes an empty string
//  - stack-buffer src: the contents are copied
//  - refcounted src: the buffer is shared and its refcount incremented
//  - read-only alias: shared when fastCopy is set, otherwise copied
//  - writable alias: always copied into newly allocated storage
// If the allocation for a copy fails, this string becomes bogus.
UnicodeString &
UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
  // if assigning to ourselves, do nothing
  if(this == &src) {
    return *this;
  }

  // a bogus right side makes us bogus as well
  if(src.isBogus()) {
    setToBogus();
    return *this;
  }

  // drop the current contents
  releaseArray();

  if(src.isEmpty()) {
    // empty string - use the stack buffer
    setToEmpty();
    return *this;
  }

  // fLength>0 and not an "open" src.getBuffer(minCapacity)
  const int16_t srcFlags = src.fUnion.fFields.fLengthAndFlags;
  fUnion.fFields.fLengthAndFlags = srcFlags;
  switch(srcFlags & kAllStorageFlags) {
  case kShortString:
    // src lives in its stack buffer: copy the units into ours
    uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
                getShortLength() * U_SIZEOF_UCHAR);
    break;
  case kLongString:
    // src owns a refcounted buffer: share it.
    // src is const; the cast is only needed to bump the shared counter.
    ((UnicodeString &)src).addRef();
    fUnion.fFields.fArray = src.fUnion.fFields.fArray;
    fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
    if(!hasShortLength()) {
      fUnion.fFields.fLength = src.fUnion.fFields.fLength;
    }
    break;
  case kReadonlyAlias:
    if(fastCopy) {
      // keep the read-only alias as such: refer to the same external text
      fUnion.fFields.fArray = src.fUnion.fFields.fArray;
      fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
      if(!hasShortLength()) {
        fUnion.fFields.fLength = src.fUnion.fFields.fLength;
      }
      break;
    }
    // !fastCopy: treat like a writable alias and copy the contents
    U_FALLTHROUGH;
  case kWritableAlias: {
    // never share a writable alias; copy into our own storage
    int32_t copyLength = src.length();
    if(allocate(copyLength)) {
      u_memcpy(getArrayStart(), src.getArrayStart(), copyLength);
      setLength(copyLength);
      break;
    }
    // out of memory: continue into the bogus case
    U_FALLTHROUGH;
  }
  default:
    // Become bogus. setToBogus() is not used because fArray and the flags
    // are not consistent with each other at this point.
    fUnion.fFields.fLengthAndFlags = kIsBogus;
    fUnion.fFields.fArray = 0;
    fUnion.fFields.fCapacity = 0;
    break;
  }

  return *this;
}
573+
574+
// Move assignment: releases the current buffer, then takes over src's fields.
// As in the standard library there is no explicit self-move check; a self
// move neither crashes nor leaks but might make the object bogus.
UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
  const UBool resetSource = TRUE;
  releaseArray();
  copyFieldsFrom(src, resetSource);
  return *this;
}
581+
582+
// Same as move assignment except without memory management.
583+
// Copies src's fields into this object without any memory management
// (nothing is released, no reference count is changed).
// A stack-buffer source has its contents copied and is left untouched.
// Otherwise the array pointer, capacity and (if used) long length are copied
// and, when setSrcToBogus is set, src is marked bogus without freeing
// anything.
void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
  const int16_t lengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
  fUnion.fFields.fLengthAndFlags = lengthAndFlags;

  if(lengthAndFlags & kUsingStackBuffer) {
    // Short string in the stack buffer: copy the contents.
    // Skip the copy for self assignment to avoid "overlap in memcpy"
    // warnings, although copying a buffer onto itself should be harmless.
    if(this != &src) {
      uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
                  getShortLength() * U_SIZEOF_UCHAR);
    }
    return;
  }

  // All other storage kinds: copy all fields.
  fUnion.fFields.fArray = src.fUnion.fFields.fArray;
  fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
  if(!hasShortLength()) {
    fUnion.fFields.fLength = src.fUnion.fFields.fLength;
  }
  if(!setSrcToBogus) {
    return;
  }
  // Set src to bogus without releasing any memory.
  src.fUnion.fFields.fLengthAndFlags = kIsBogus;
  src.fUnion.fFields.fArray = NULL;
  src.fUnion.fFields.fCapacity = 0;
}
608+
609+
// Exchanges the contents of this string and `other` by rotating the raw
// fields through a temporary; no buffer is allocated, copied or released.
void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
  const UBool keepSource = FALSE;
  UnicodeString holder;  // Empty short string: known not to need releaseArray().
  // Rotate the fields without resetting the source values in between.
  holder.copyFieldsFrom(*this, keepSource);
  this->copyFieldsFrom(other, keepSource);
  other.copyFieldsFrom(holder, keepSource);
  // Make holder an empty string again so that its destructor does not
  // release the memory that now belongs to other.
  holder.fUnion.fFields.fLengthAndFlags = kShortString;
}
618+
619+
//========================================
620+
// Miscellaneous operations
621+
//========================================
622+
623+
// Returns a copy of this string in which each backslash escape sequence has
// been replaced by the code point that unescapeAt() decodes for it.
// An invalid escape sequence yields an empty result. If the result string
// cannot be set up it is returned bogus.
UnicodeString UnicodeString::unescape() const {
  UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
  if (result.isBogus()) {
    return result;
  }
  const UChar *array = getBuffer();
  const int32_t len = length();
  int32_t prev = 0;  // start of the literal run that has not been appended yet
  int32_t i = 0;
  while (i != len) {
    if (array[i++] != 0x5C /*'\\'*/) {
      continue;
    }
    // append the literal text in front of the backslash
    result.append(array, prev, (i - 1) - prev);
    UChar32 c = unescapeAt(i); // advances i
    if (c < 0) {
      // invalid escape sequence: return an empty string
      result.remove();
      return result;
    }
    result.append(c);
    prev = i;
  }
  // append the remaining literal text
  result.append(array, prev, len - prev);
  return result;
}
649+
650+
// Decodes the escape sequence at `offset` by calling u_unescapeAt() with
// UnicodeString_charAt as the character accessor for this string; `offset`
// is passed by address so that u_unescapeAt() can update it.
UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
  void *context = (void*)this;
  return u_unescapeAt(UnicodeString_charAt, &offset, length(), context);
}
653+
654+
//========================================
655+
// Read-only implementation
656+
//========================================
657+
// Returns whether the first `len` UChars of this string and of `text` are
// identical.
// Requires: neither string is bogus and both have the same length.
UBool
UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
  // Byte-wise comparison is sufficient for equality regardless of endianness.
  const UChar *mine = getArrayStart();
  const UChar *theirs = text.getArrayStart();
  return uprv_memcmp(mine, theirs, len * U_SIZEOF_UCHAR) == 0;
}
663+
664+
int8_t
665+
UnicodeString::doCompare( int32_t start,
666+
int32_t length,
667+
const UChar *srcChars,
668+
int32_t srcStart,
669+
int32_t srcLength) const
670+
{
671+
// compare illegal string values
672+
if(isBogus()) {
673+
return -1;
674+
}
675+
676+
// pin indices to legal values
677+
pinIndices(start, length);
678+
679+
if(srcChars == NULL) {
680+
// treat const UChar *srcChars==NULL as an empty string
681+
return length == 0 ? 0 : 1;
682+
}
683+
684+
// get the correct pointer
685+
const UChar *chars = getArrayStart();
686+
687+
chars += start;
688+
srcChars += srcStart;
689+
690+
int32_t minLength;
691+
int8_t lengthResult;
692+
693+
// get the srcLength if necessary
694+
if(srcLength < 0) {
695+
srcLength = u_strlen(srcChars + srcStart);
696+
}
697+
698+
// are we comparing different lengths?
699+
if(length != srcLength) {
700+
if(length < srcLength) {
701+
minLength = length;
702+
lengthResult = -1;
703+
} else {
704+
minLength = srcLength;
705+
lengthResult = 1;
706+
}
707+
} else {
708+
minLength = length;
709+
lengthResult = 0;
710+
}
711+
712+
/*
713+
* note that uprv_memcmp() returns an int but we return an int8_t;
714+
* we need to take care not to truncate the result -
715+
* one way to do this is to right-shift the value to
716+
* move the sign bit into the lower 8 bits and making sure that this
717+
* does not become 0 itself
718+
*/
719+
720+
if(minLength > 0 && chars != srcChars) {
721+
int32_t result;
722+
723+
# if U_IS_BIG_ENDIAN
724+
// big-endian: byte comparison works
725+
result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
726+
if(result != 0) {
727+
return (int8_t)(result >> 15 | 1);
728+
}
729+
# else
730+
// little-endian: compare UChar units
731+
do {
732+
result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
733+
if(result != 0) {
734+
return (int8_t)(result >> 15 | 1);
735+
}
736+
} while(--minLength > 0);
737+
# endif
738+
}
739+
return lengthResult;
740+
}
741+
742+
/* String compare in code point order - doCompare() compares in code unit order. */
743+
int8_t
744+
UnicodeString::doCompareCodePointOrder(int32_t start,
745+
int32_t length,
746+
const UChar *srcChars,
747+
int32_t srcStart,
748+
int32_t srcLength) const
749+
{
750+
// compare illegal string values
751+
// treat const UChar *srcChars==NULL as an empty string
752+
if(isBogus()) {
753+
return -1;
754+
}
755+
756+
// pin indices to legal values
757+
pinIndices(start, length);
758+
759+
if(srcChars == NULL) {
760+
srcStart = srcLength = 0;
761+
}
762+
763+
int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
764+
/* translate the 32-bit result into an 8-bit one */
765+
if(diff!=0) {
766+
return (int8_t)(diff >> 15 | 1);
767+
} else {
768+
return 0;
769+
}
770+
}
771+
772+
// Non-inline forwarder to length().
int32_t
UnicodeString::getLength() const {
  const int32_t units = length();
  return units;
}
776+
777+
// Non-inline forwarder to charAt(offset).
UChar
UnicodeString::getCharAt(int32_t offset) const {
  const UChar unit = charAt(offset);
  return unit;
}
781+
782+
// Non-inline forwarder to char32At(offset).
UChar32
UnicodeString::getChar32At(int32_t offset) const {
  const UChar32 codePoint = char32At(offset);
  return codePoint;
}
786+
787+
// Returns the code point read by U16_GET at `offset`, or kInvalidUChar if
// offset is outside [0, length()).
UChar32
UnicodeString::char32At(int32_t offset) const
{
  const int32_t len = length();
  // the unsigned comparison also rejects negative offsets
  if((uint32_t)offset >= (uint32_t)len) {
    return kInvalidUChar;
  }
  const UChar *array = getArrayStart();
  UChar32 c;
  U16_GET(array, 0, offset, len, c);
  return c;
}
800+
801+
int32_t
802+
UnicodeString::getChar32Start(int32_t offset) const {
803+
if((uint32_t)offset < (uint32_t)length()) {
804+
const UChar *array = getArrayStart();
805+
U16_SET_CP_START(array, 0, offset);
806+
return offset;
807+
} else {
808+
return 0;
809+
}
810+
}
811+
812+
int32_t
813+
UnicodeString::getChar32Limit(int32_t offset) const {
814+
int32_t len = length();
815+
if((uint32_t)offset < (uint32_t)len) {
816+
const UChar *array = getArrayStart();
817+
U16_SET_CP_LIMIT(array, 0, offset, len);
818+
return offset;
819+
} else {
820+
return len;
821+
}
822+
}
823+
824+
// Returns u_countChar32() for the pinned range (start, length).
int32_t
UnicodeString::countChar32(int32_t start, int32_t length) const {
  pinIndices(start, length);
  // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
  const UChar *rangeStart = getArrayStart()+start;
  return u_countChar32(rangeStart, length);
}
830+
831+
// Returns u_strHasMoreChar32Than() for the pinned range (start, length) and
// the given `number`.
UBool
UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
  pinIndices(start, length);
  // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
  const UChar *rangeStart = getArrayStart()+start;
  return u_strHasMoreChar32Than(rangeStart, length, number);
}
837+
838+
// Pins `index` to [0, length()], then moves it with U16_FWD_N (delta > 0) or
// U16_BACK_N (delta <= 0, by -delta) and returns the resulting index.
int32_t
UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
  const int32_t len = length();
  // pin index
  index = (index < 0) ? 0 : ((index > len) ? len : index);

  const UChar *array = getArrayStart();
  if(delta <= 0) {
    U16_BACK_N(array, 0, index, -delta);
  } else {
    U16_FWD_N(array, index, len, delta);
  }

  return index;
}
857+
858+
void
859+
UnicodeString::doExtract(int32_t start,
860+
int32_t length,
861+
UChar *dst,
862+
int32_t dstStart) const
863+
{
864+
// pin indices to legal values
865+
pinIndices(start, length);
866+
867+
// do not copy anything if we alias dst itself
868+
const UChar *array = getArrayStart();
869+
if(array + start != dst + dstStart) {
870+
us_arrayCopy(array, start, dst, dstStart, length);
871+
}
872+
}
873+
874+
// Copies the whole string into dest (if it fits) and terminates it via
// u_terminateUChars(), whose result is returned.
// If errorCode already indicates a failure nothing is done. A bogus string,
// a negative destCapacity, or a null dest with destCapacity > 0 sets
// U_ILLEGAL_ARGUMENT_ERROR. In both of these cases length() is returned.
int32_t
UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
                       UErrorCode &errorCode) const {
  const int32_t len = length();
  if(U_FAILURE(errorCode)) {
    return len;
  }
  if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
    errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return len;
  }

  const UChar *array = getArrayStart();
  // copy only when there is something to copy, it fits, and dest is not
  // already our own buffer
  if(len>0 && len<=destCapacity && array!=dest) {
    u_memcpy(dest, array, len);
  }
  return u_terminateUChars(dest, destCapacity, len, &errorCode);
}
892+
893+
// Converts the pinned range (start, length) to chars with u_UCharsToChars()
// when it fits into targetCapacity, then returns the result of
// u_terminateChars() for that length.
// Returns 0 without doing anything if targetCapacity is negative, or if
// target is NULL while targetCapacity > 0.
int32_t
UnicodeString::extract(int32_t start,
                       int32_t length,
                       char *target,
                       int32_t targetCapacity,
                       enum EInvariant) const
{
  // if the arguments are illegal, then do nothing
  const UBool missingTarget = (targetCapacity > 0 && target == NULL);
  if(targetCapacity < 0 || missingTarget) {
    return 0;
  }

  // pin the indices to legal values
  pinIndices(start, length);

  if(length <= targetCapacity) {
    const UChar *rangeStart = getArrayStart() + start;
    u_UCharsToChars(rangeStart, target, length);
  }
  // the termination status is not reported to the caller
  UErrorCode status = U_ZERO_ERROR;
  return u_terminateChars(target, targetCapacity, length, &status);
}
914+
915+
// Returns a read-only alias (not NUL-terminated) of the pinned range
// (start, len) of this string. If getBuffer() returns NULL the result is
// bogus.
UnicodeString
UnicodeString::tempSubString(int32_t start, int32_t len) const {
  pinIndices(start, len);
  // getBuffer() rather than getArrayStart(): it checks kIsBogus & kOpenGetBuffer
  const UChar *array = getBuffer();
  if(array==NULL) {
    // Any non-NULL pointer will do here (NULL would produce an empty string);
    // the length -2 makes the aliasing constructor return a bogus string.
    array=fUnion.fStackFields.fBuffer;
    len=-2;
  }
  const UChar *rangeStart = array + start;
  return UnicodeString(FALSE, rangeStart, len);
}
925+
926+
// Converts the pinned range (start, len) to UTF-8 in `target` (capacity
// bytes) with u_strToUTF8WithSub(), using U+FFFD as the substitution
// character, and returns the length reported by the converter. The
// converter's error code is not reported to the caller.
int32_t
UnicodeString::toUTF8(int32_t start, int32_t len,
                      char *target, int32_t capacity) const {
  pinIndices(start, len);
  const UChar *rangeStart = getBuffer() + start;
  int32_t length8;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strToUTF8WithSub(target, capacity, &length8,
                     rangeStart, len,
                     0xFFFD,  // Standard substitution character.
                     NULL,    // Don't care about number of substitutions.
                     &errorCode);
  return length8;
}
939+
940+
#if U_CHARSET_IS_UTF8
941+
942+
int32_t
943+
UnicodeString::extract(int32_t start, int32_t len,
944+
char *target, uint32_t dstSize) const {
945+
// if the arguments are illegal, then do nothing
946+
if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
947+
return 0;
948+
}
949+
return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
950+
}
951+
952+
// else see unistr_cnv.cpp
953+
#endif
954+
955+
void
956+
UnicodeString::extractBetween(int32_t start,
957+
int32_t limit,
958+
UnicodeString& target) const {
959+
pinIndex(start);
960+
pinIndex(limit);
961+
doExtract(start, limit - start, target);
962+
}
963+
964+
// When converting from UTF-16 to UTF-8, the result will have at most 3 times
965+
// as many bytes as the source has UChars.
966+
// The "worst cases" are writing systems like Indic, Thai and CJK with
967+
// 3:1 bytes:UChars.
968+
void
969+
UnicodeString::toUTF8(ByteSink &sink) const {
970+
int32_t length16 = length();
971+
if(length16 != 0) {
972+
char stackBuffer[1024];
973+
int32_t capacity = (int32_t)sizeof(stackBuffer);
974+
UBool utf8IsOwned = FALSE;
975+
char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
976+
3*length16,
977+
stackBuffer, capacity,
978+
&capacity);
979+
int32_t length8 = 0;
980+
UErrorCode errorCode = U_ZERO_ERROR;
981+
u_strToUTF8WithSub(utf8, capacity, &length8,
982+
getBuffer(), length16,
983+
0xFFFD, // Standard substitution character.
984+
NULL, // Don't care about number of substitutions.
985+
&errorCode);
986+
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
987+
utf8 = (char *)uprv_malloc(length8);
988+
if(utf8 != NULL) {
989+
utf8IsOwned = TRUE;
990+
errorCode = U_ZERO_ERROR;
991+
u_strToUTF8WithSub(utf8, length8, &length8,
992+
getBuffer(), length16,
993+
0xFFFD, // Standard substitution character.
994+
NULL, // Don't care about number of substitutions.
995+
&errorCode);
996+
} else {
997+
errorCode = U_MEMORY_ALLOCATION_ERROR;
998+
}
999+
}
1000+
if(U_SUCCESS(errorCode)) {
1001+
sink.Append(utf8, length8);
1002+
sink.Flush();
1003+
}
1004+
if(utf8IsOwned) {
1005+
uprv_free(utf8);
1006+
}
1007+
}
1008+
}
1009+
1010+
// Converts the whole string to UTF-32 in `utf32` (capacity code points) with
// u_strToUTF32WithSub(), using U+FFFD as the substitution character.
// Returns the length reported by the converter, or 0 if errorCode already
// indicated a failure on entry.
int32_t
UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
  int32_t length32=0;
  if(U_FAILURE(errorCode)) {
    return length32;
  }
  // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
  u_strToUTF32WithSub(utf32, capacity, &length32,
      getBuffer(), length(),
      0xfffd,  // Substitution character.
      NULL,    // Don't care about number of substitutions.
      &errorCode);
  return length32;
}
1023+
1024+
// Returns the index in this string of the first occurrence of the text
// (srcStart, srcLength) of srcChars within the pinned range (start, length),
// or -1 if it is not found.
// Also returns -1 if this string is bogus, srcChars is null, srcStart is
// negative, or the search text is empty (empty substrings are never found).
int32_t
UnicodeString::indexOf(const UChar *srcChars,
                       int32_t srcStart,
                       int32_t srcLength,
                       int32_t start,
                       int32_t length) const
{
  if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    return -1;
  }

  // UnicodeString does not find empty substrings
  // (srcLength < 0: the text is NUL-terminated, so test its first unit)
  if(srcLength < 0 && srcChars[srcStart] == 0) {
    return -1;
  }

  // get the indices within bounds
  pinIndices(start, length);

  // find the first occurrence of the substring
  const UChar *array = getArrayStart();
  const UChar *found = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
  return (found == NULL) ? -1 : (int32_t)(found - array);
}
1052+
1053+
// Returns the index (relative to the start of this string) of the first code
// unit equal to c inside the pinned [start, start + length) range, or -1 if
// u_memchr() finds none.
int32_t
UnicodeString::doIndexOf(UChar c,
             int32_t start,
             int32_t length) const
{
  // bring the search range within bounds
  pinIndices(start, length);

  const UChar *base = getArrayStart();
  const UChar *hit = u_memchr(base + start, c, length);
  return (hit == NULL) ? -1 : (int32_t)(hit - base);
}
1070+
1071+
// Code point variant: returns the index (relative to the start of this
// string) of the first occurrence of c inside the pinned
// [start, start + length) range, or -1 if u_memchr32() finds none.
int32_t
UnicodeString::doIndexOf(UChar32 c,
             int32_t start,
             int32_t length) const {
  // bring the search range within bounds
  pinIndices(start, length);

  const UChar *base = getArrayStart();
  const UChar *hit = u_memchr32(base + start, c, length);
  return (hit == NULL) ? -1 : (int32_t)(hit - base);
}
1087+
1088+
// Searches the pinned [start, start + length) range of this string for the
// last occurrence of the text at srcChars + srcStart (srcLength units).
// Returns the match index relative to the start of this string, or -1 when
// nothing matches, this string is bogus, srcChars is null, srcStart is
// negative, or the search text is empty.
int32_t
UnicodeString::lastIndexOf(const UChar *srcChars,
               int32_t srcStart,
               int32_t srcLength,
               int32_t start,
               int32_t length) const
{
  if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
    return -1;
  }

  // With a negative srcLength, a NUL right at srcStart means the search text
  // is empty; empty substrings are never found.
  if(srcLength < 0 && srcChars[srcStart] == 0) {
    return -1;
  }

  // bring the search range within bounds
  pinIndices(start, length);

  const UChar *base = getArrayStart();
  const UChar *hit = u_strFindLast(base + start, length, srcChars + srcStart, srcLength);
  return (hit == NULL) ? -1 : (int32_t)(hit - base);
}
1116+
1117+
// Returns the index (relative to the start of this string) of the last code
// unit equal to c inside the pinned [start, start + length) range, or -1 if
// this string is bogus or u_memrchr() finds none.
int32_t
UnicodeString::doLastIndexOf(UChar c,
                 int32_t start,
                 int32_t length) const
{
  if(isBogus()) {
    return -1;
  }

  // bring the search range within bounds
  pinIndices(start, length);

  const UChar *base = getArrayStart();
  const UChar *hit = u_memrchr(base + start, c, length);
  return (hit == NULL) ? -1 : (int32_t)(hit - base);
}
1138+
1139+
// Code point variant: returns the index (relative to the start of this
// string) of the last occurrence of c inside the pinned
// [start, start + length) range, or -1 if u_memrchr32() finds none.
int32_t
UnicodeString::doLastIndexOf(UChar32 c,
                 int32_t start,
                 int32_t length) const {
  // bring the search range within bounds
  pinIndices(start, length);

  const UChar *base = getArrayStart();
  const UChar *hit = u_memrchr32(base + start, c, length);
  return (hit == NULL) ? -1 : (int32_t)(hit - base);
}
1155+
1156+
//========================================
1157+
// Write implementation
1158+
//========================================
1159+
1160+
// Replaces, within the [start, start + length) range of this string, each
// occurrence of oldText[oldStart, oldStart + oldLength) with
// newText[newStart, newStart + newLength). All three ranges are pinned first.
// Nothing happens if any of the three strings is bogus or if the pinned
// pattern is empty. Returns *this.
UnicodeString&
UnicodeString::findAndReplace(int32_t start,
                  int32_t length,
                  const UnicodeString& oldText,
                  int32_t oldStart,
                  int32_t oldLength,
                  const UnicodeString& newText,
                  int32_t newStart,
                  int32_t newLength)
{
  if(isBogus() || oldText.isBogus() || newText.isBogus()) {
    return *this;
  }

  pinIndices(start, length);
  oldText.pinIndices(oldStart, oldLength);
  newText.pinIndices(newStart, newLength);

  // an empty pattern never matches
  if(oldLength == 0) {
    return *this;
  }

  // keep searching while the remaining range can still hold the pattern
  while(length > 0 && length >= oldLength) {
    const int32_t found = indexOf(oldText, oldStart, oldLength, start, length);
    if(found < 0) {
      // no further occurrences
      break;
    }
    // substitute this occurrence, then continue right after the inserted text
    replace(found, oldLength, newText, newStart, newLength);
    length -= found + oldLength - start;
    start = found + newLength;
  }

  return *this;
}
1197+
1198+
1199+
void
1200+
UnicodeString::setToBogus()
1201+
{
1202+
releaseArray();
1203+
1204+
fUnion.fFields.fLengthAndFlags = kIsBogus;
1205+
fUnion.fFields.fArray = 0;
1206+
fUnion.fFields.fCapacity = 0;
1207+
}
1208+
1209+
// turn a bogus string into an empty one
1210+
void
1211+
UnicodeString::unBogus() {
1212+
if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213+
setToEmpty();
1214+
}
1215+
}
1216+
1217+
const char16_t *
1218+
UnicodeString::getTerminatedBuffer() {
1219+
if(!isWritable()) {
1220+
return nullptr;
1221+
}
1222+
UChar *array = getArrayStart();
1223+
int32_t len = length();
1224+
if(len < getCapacity()) {
1225+
if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1226+
// If len<capacity on a read-only alias, then array[len] is
1227+
// either the original NUL (if constructed with (TRUE, s, length))
1228+
// or one of the original string contents characters (if later truncated),
1229+
// therefore we can assume that array[len] is initialized memory.
1230+
if(array[len] == 0) {
1231+
return array;
1232+
}
1233+
} else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1234+
// kRefCounted: Do not write the NUL if the buffer is shared.
1235+
// That is mostly safe, except when the length of one copy was modified
1236+
// without copy-on-write, e.g., via truncate(newLength) or remove(void).
1237+
// Then the NUL would be written into the middle of another copy's string.
1238+
1239+
// Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1240+
// Do not test if there is a NUL already because it might be uninitialized memory.
1241+
// (That would be safe, but tools like valgrind & Purify would complain.)
1242+
array[len] = 0;
1243+
return array;
1244+
}
1245+
}
1246+
if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1247+
array = getArrayStart();
1248+
array[len] = 0;
1249+
return array;
1250+
} else {
1251+
return nullptr;
1252+
}
1253+
}
1254+
1255+
// setTo() analogous to the readonly-aliasing constructor with the same signature
1256+
UnicodeString &
1257+
UnicodeString::setTo(UBool isTerminated,
1258+
ConstChar16Ptr textPtr,
1259+
int32_t textLength)
1260+
{
1261+
if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262+
// do not modify a string that has an "open" getBuffer(minCapacity)
1263+
return *this;
1264+
}
1265+
1266+
const UChar *text = textPtr;
1267+
if(text == NULL) {
1268+
// treat as an empty string, do not alias
1269+
releaseArray();
1270+
setToEmpty();
1271+
return *this;
1272+
}
1273+
1274+
if( textLength < -1 ||
1275+
(textLength == -1 && !isTerminated) ||
1276+
(textLength >= 0 && isTerminated && text[textLength] != 0)
1277+
) {
1278+
setToBogus();
1279+
return *this;
1280+
}
1281+
1282+
releaseArray();
1283+
1284+
if(textLength == -1) {
1285+
// text is terminated, or else it would have failed the above test
1286+
textLength = u_strlen(text);
1287+
}
1288+
fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289+
setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290+
return *this;
1291+
}
1292+
1293+
// setTo() analogous to the writable-aliasing constructor with the same signature
1294+
UnicodeString &
1295+
UnicodeString::setTo(UChar *buffer,
1296+
int32_t buffLength,
1297+
int32_t buffCapacity) {
1298+
if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299+
// do not modify a string that has an "open" getBuffer(minCapacity)
1300+
return *this;
1301+
}
1302+
1303+
if(buffer == NULL) {
1304+
// treat as an empty string, do not alias
1305+
releaseArray();
1306+
setToEmpty();
1307+
return *this;
1308+
}
1309+
1310+
if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311+
setToBogus();
1312+
return *this;
1313+
} else if(buffLength == -1) {
1314+
// buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315+
const UChar *p = buffer, *limit = buffer + buffCapacity;
1316+
while(p != limit && *p != 0) {
1317+
++p;
1318+
}
1319+
buffLength = (int32_t)(p - buffer);
1320+
}
1321+
1322+
releaseArray();
1323+
1324+
fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325+
setArray(buffer, buffLength, buffCapacity);
1326+
return *this;
1327+
}
1328+
1329+
// Replaces the contents of this string with the UTF-16 conversion of utf8.
// Ill-formed UTF-8 is replaced with U+FFFD. If the conversion fails, or no
// buffer can be obtained, the string is set to bogus. A string that currently
// has an "open" getBuffer(minCapacity) is left untouched, as setTo() does.
// Returns *this.
UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
  unBogus();
  int32_t length = utf8.length();
  int32_t capacity;
  // The UTF-16 string will be at most as long as the UTF-8 string.
  if(length <= US_STACKBUF_SIZE) {
    capacity = US_STACKBUF_SIZE;
  } else {
    capacity = length + 1;  // +1 for the terminating NUL.
  }
  UChar *utf16 = getBuffer(capacity);
  if(utf16 == nullptr) {
    // getBuffer() hands out no buffer when the string is not writable or the
    // allocation failed. Do not pass a null destination on, and do not call
    // releaseBuffer(): that would close (and the setToBogus() below would
    // free) a buffer that some other caller still holds open.
    if(!(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer)) {
      setToBogus();
    }
    return *this;
  }
  // Initialized so that releaseBuffer() never reads an indeterminate value,
  // even if the conversion returns without storing a length.
  int32_t length16 = 0;
  UErrorCode errorCode = U_ZERO_ERROR;
  u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
      utf8.data(), length,
      0xfffd,  // Substitution character.
      NULL,    // Don't care about number of substitutions.
      &errorCode);
  releaseBuffer(length16);
  if(U_FAILURE(errorCode)) {
    setToBogus();
  }
  return *this;
}
1353+
1354+
// Stores c at the given offset after clamping the offset into
// [0, length() - 1]. Does nothing if a writable array cannot be obtained or
// the string is empty. Returns *this.
UnicodeString&
UnicodeString::setCharAt(int32_t offset,
             UChar c)
{
  const int32_t count = length();
  if(!cloneArrayIfNeeded() || count <= 0) {
    return *this;
  }

  // clamp the offset to an existing code unit
  if(offset < 0) {
    offset = 0;
  } else if(offset >= count) {
    offset = count - 1;
  }

  getArrayStart()[offset] = c;
  return *this;
}
1370+
1371+
// Replaces the range (start, _length) with the UTF-16 encoding of srcChar.
// If srcChar is not a valid code point, the range is removed instead of being
// replaced. Returns the result of doReplace().
UnicodeString&
UnicodeString::replace(int32_t start,
               int32_t _length,
               UChar32 srcChar) {
  UChar units[U16_MAX_LENGTH];
  int32_t unitCount = 0;
  UBool invalid = FALSE;
  U16_APPEND(units, unitCount, U16_MAX_LENGTH, srcChar, invalid);
  // Checking the error flag also keeps the compiler from complaining that it
  // is unused; an invalid code point contributes zero replacement units.
  if(invalid) {
    unitCount = 0;
  }
  return doReplace(start, _length, units, 0, unitCount);
}
1384+
1385+
// Appends the UTF-16 encoding of srcChar. An invalid code point leaves the
// string unchanged. Returns *this.
UnicodeString&
UnicodeString::append(UChar32 srcChar) {
  UChar units[U16_MAX_LENGTH];
  int32_t unitCount = 0;
  UBool invalid = FALSE;
  U16_APPEND(units, unitCount, U16_MAX_LENGTH, srcChar, invalid);
  // Checking the error flag also keeps the compiler from complaining that it
  // is unused; with an error there are no units to append anyway.
  if(invalid) {
    return *this;
  }
  return doAppend(units, 0, unitCount);
}
1395+
1396+
// UnicodeString-source overload: pins (srcStart, srcLength) against src and
// forwards to the UChar* overload using src's array.
UnicodeString&
UnicodeString::doReplace( int32_t start,
              int32_t length,
              const UnicodeString& src,
              int32_t srcStart,
              int32_t srcLength)
{
  // make the source range legal for src
  src.pinIndices(srcStart, srcLength);

  // replace our range with the selected characters of src
  const UChar *srcArray = src.getArrayStart();
  return doReplace(start, length, srcArray, srcStart, srcLength);
}
1410+
1411+
// Core replace: substitutes the range (start, length) of this string with
// srcLength units taken from srcChars + srcStart.
// - Does nothing if this string is not writable.
// - start == length() is handed to doAppend().
// - A null srcChars is treated as empty text; a negative srcLength means
//   "measure with u_strlen()".
// - If the resulting length would overflow int32_t the string becomes bogus.
// Returns *this.
UnicodeString&
UnicodeString::doReplace(int32_t start,
             int32_t length,
             const UChar *srcChars,
             int32_t srcStart,
             int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  const int32_t priorLength = this->length();

  // Shortcut for (read-only alias).remove(0, start) and .remove(start, end):
  // adjust the alias instead of copying.
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // drop a prefix by advancing the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(priorLength - length);
      return *this;
    }
    pinIndex(start);
    if(length >= (priorLength - start)) {
      // drop a suffix by shortening the length (like truncate())
      setLength(start);
      fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
      return *this;
    }
  }

  if(start == priorLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcLength = 0;
  } else {
    // Everything below works relative to srcChars + srcStart;
    // srcStart must not be used after this line.
    srcChars += srcStart;
    if (srcLength < 0) {
      // length not supplied: measure the text
      srcLength = u_strlen(srcChars);
    }
  }

  // make the target range legal
  pinIndices(start, length);

  // Length after the replacement, computed without int32_t overflow.
  int32_t resultLength = priorLength - length;
  if(srcLength > (INT32_MAX - resultLength)) {
    setToBogus();
    return *this;
  }
  resultLength += srcLength;

  // Source text overlapping our own writable buffer: work from a private copy.
  const UChar *before = getArrayStart();
  if (isBufferWritable() &&
      before < srcChars + srcLength &&
      srcChars < before + priorLength) {
    UnicodeString detached(srcChars, srcLength);
    if (detached.isBogus()) {
      setToBogus();
      return *this;
    }
    return doReplace(start, length, detached.getArrayStart(), 0, srcLength);
  }

  // cloneArrayIfNeeded(doCopyArray=FALSE) may switch fArray without copying
  // the current contents, so the current contents must stay reachable.
  UChar savedStack[US_STACKBUF_SIZE];
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (resultLength > US_STACKBUF_SIZE)) {
    // the stack buffer is about to be overwritten with fUnion.fFields values
    u_memcpy(savedStack, before, priorLength);
    before = savedStack;
  }

  // get a writable array, larger if necessary
  int32_t *deferredFree = 0;
  if(!cloneArrayIfNeeded(resultLength, getGrowCapacity(resultLength),
                         FALSE, &deferredFree)
  ) {
    return *this;
  }

  // perform the replacement

  UChar *after = getArrayStart();
  const UBool arrayChanged = (after != before);
  if(arrayChanged) {
    // new array: the unchanged prefix has to be carried over
    us_arrayCopy(before, 0, after, 0, start);
  }
  if(arrayChanged || length != srcLength) {
    // move the unchanged tail to its final position, leaving a hole
    us_arrayCopy(before, start + length,
                 after, start + srcLength,
                 priorLength - (start + length));
  }

  // fill the hole with the new text
  us_arrayCopy(srcChars, 0, after, start, srcLength);

  setLength(resultLength);

  // Freed only now: the old array had to stay alive for the copies above,
  // also in case srcChars == fArray when we started.
  if (deferredFree) {
    uprv_free(deferredFree);
  }

  return *this;
}
1533+
1534+
// Versions of doReplace() only for append() variants.
1535+
// doReplace() and doAppend() optimize for different cases.
1536+
1537+
// UnicodeString-source overload: a zero srcLength is a no-op; otherwise
// (srcStart, srcLength) is pinned against src and the call is forwarded to
// the UChar* overload using src's array.
UnicodeString&
UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
  if(srcLength != 0) {
    // make the source range legal for src
    src.pinIndices(srcStart, srcLength);
    return doAppend(src.getArrayStart(), srcStart, srcLength);
  }
  return *this;
}
1547+
1548+
// Appends srcLength units starting at srcChars + srcStart.
// - Does nothing if this string is not writable, srcLength is 0, or srcChars
//   is null.
// - A negative srcLength means "measure with u_strlen()".
// - If the combined length overflows int32_t the string becomes bogus.
// Returns *this.
UnicodeString&
UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
  if(!isWritable() || srcLength == 0 || srcChars == NULL) {
    return *this;
  }

  // Everything below works relative to srcChars + srcStart;
  // srcStart must not be used after this line.
  srcChars += srcStart;

  if(srcLength < 0) {
    // length not supplied: measure the text, and stop if it is empty
    if((srcLength = u_strlen(srcChars)) == 0) {
      return *this;
    }
  }

  const int32_t priorLength = length();
  int32_t resultLength;
  if (uprv_add32_overflow(priorLength, srcLength, &resultLength)) {
    setToBogus();
    return *this;
  }

  // Source text overlapping our own writable buffer: work from a private copy.
  const UChar* before = getArrayStart();
  if (isBufferWritable() &&
      before < srcChars + srcLength &&
      srcChars < before + priorLength) {
    UnicodeString detached(srcChars, srcLength);
    if (detached.isBogus()) {
      setToBogus();
      return *this;
    }
    return doAppend(detached.getArrayStart(), 0, srcLength);
  }

  // Fast path: a writable buffer that is already large enough is used as is;
  // otherwise a suitable array is obtained via cloneArrayIfNeeded().
  const UBool fitsInPlace = (resultLength <= getCapacity() && isBufferWritable());
  if(!fitsInPlace && !cloneArrayIfNeeded(resultLength, getGrowCapacity(resultLength))) {
    return *this;
  }

  UChar *target = getArrayStart();
  // Skip the copy when the caller wrote directly into our spare capacity:
  //   UChar *buffer=str.getAppendBuffer(...);
  // followed by
  //   str.append(buffer, length);
  // or
  //   str.appendString(buffer, length)
  // or similar.
  if(srcChars != target + priorLength) {
    us_arrayCopy(srcChars, 0, target, priorLength, srcLength);
  }
  setLength(resultLength);
  return *this;
}
1604+
1605+
/**
1606+
* Replaceable API
1607+
*/
1608+
void
1609+
UnicodeString::handleReplaceBetween(int32_t start,
1610+
int32_t limit,
1611+
const UnicodeString& text) {
1612+
replaceBetween(start, limit, text);
1613+
}
1614+
1615+
/**
1616+
* Replaceable API
1617+
*/
1618+
void
1619+
UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1620+
if (limit <= start) {
1621+
return; // Nothing to do; avoid bogus malloc call
1622+
}
1623+
UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1624+
// Check to make sure text is not null.
1625+
if (text != NULL) {
1626+
extractBetween(start, limit, text, 0);
1627+
insert(dest, text, 0, limit - start);
1628+
uprv_free(text);
1629+
}
1630+
}
1631+
1632+
/**
1633+
* Replaceable API
1634+
*
1635+
* NOTE: This is for the Replaceable class. There is no rep.cpp,
1636+
* so we implement this function here.
1637+
*/
1638+
UBool Replaceable::hasMetaData() const {
1639+
return TRUE;
1640+
}
1641+
1642+
/**
1643+
* Replaceable API
1644+
*/
1645+
UBool UnicodeString::hasMetaData() const {
1646+
return FALSE;
1647+
}
1648+
1649+
// Reverses the code units in the pinned range (start, length). If the range
// contains lead surrogates, a second pass swaps each (trail, lead) pair back
// so that the surrogate pairs of supplementary code points stay in order.
// Ranges of fewer than two units, and strings for which no writable array
// can be obtained, are left unchanged. Returns *this.
UnicodeString&
UnicodeString::doReverse(int32_t start, int32_t length) {
  if(length <= 1 || !cloneArrayIfNeeded()) {
    return *this;
  }

  // make the range legal; this may shrink it below two units
  pinIndices(start, length);
  if(length <= 1) {
    return *this;
  }

  UChar *lo = getArrayStart() + start;
  UChar *hi = lo + length - 1;  // inclusive upper end (length>=2)
  UBool sawLead = FALSE;

  // length>=2 guarantees lo<hi on entry.
  do {
    const UChar fromLow = *lo;
    const UChar fromHigh = *hi;
    sawLead |= (UBool)U16_IS_LEAD(fromLow);
    sawLead |= (UBool)U16_IS_LEAD(fromHigh);
    *lo++ = fromHigh;
    *hi-- = fromLow;
  } while(lo < hi);
  // An odd-length range leaves its middle unit unvisited; test it as well
  // (this is redundant for even lengths).
  sawLead |= (UBool)U16_IS_LEAD(*lo);

  /* supplementary code points in the range: put their surrogates back in order */
  if(sawLead) {
    UChar *cursor = getArrayStart() + start;
    UChar *const lastUnit = cursor + length - 1;  // cursor[1] is valid while cursor<lastUnit
    while(cursor < lastUnit) {
      const UChar first = cursor[0];
      if(U16_IS_TRAIL(first)) {
        const UChar second = cursor[1];
        if(U16_IS_LEAD(second)) {
          cursor[0] = second;
          cursor[1] = first;
          cursor += 2;
          continue;
        }
      }
      ++cursor;
    }
  }

  return *this;
}
1694+
1695+
// Pads this string at the front with padChar until it is targetLength units
// long. Returns FALSE (string unchanged by this function) if the string is
// already at least that long or cloneArrayIfNeeded(targetLength) fails;
// TRUE after padding.
UBool
UnicodeString::padLeading(int32_t targetLength,
                          UChar padChar)
{
  const int32_t priorLength = length();
  if(priorLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
    return FALSE;
  }

  UChar *chars = getArrayStart();
  const int32_t padWidth = targetLength - priorLength;

  // shift the existing contents up by the padding width
  us_arrayCopy(chars, 0, chars, padWidth, priorLength);

  // fill the vacated front with the padding character
  for(int32_t i = 0; i < padWidth; ++i) {
    chars[i] = padChar;
  }
  setLength(targetLength);
  return TRUE;
}
1716+
1717+
// Pads this string at the end with padChar until it is targetLength units
// long. Returns FALSE (string unchanged by this function) if the string is
// already at least that long or cloneArrayIfNeeded(targetLength) fails;
// TRUE after padding.
UBool
UnicodeString::padTrailing(int32_t targetLength,
                           UChar padChar)
{
  const int32_t priorLength = length();
  if(priorLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
    return FALSE;
  }

  // fill everything between the old and the new end with the padding character
  UChar *chars = getArrayStart();
  for(int32_t i = priorLength; i < targetLength; ++i) {
    chars[i] = padChar;
  }
  setLength(targetLength);
  return TRUE;
}
1735+
1736+
//========================================
1737+
// Hashing
1738+
//========================================
1739+
// Computes the hash code of the current contents via ustr_hashUCharsN(), so
// that UnicodeString hashing agrees with UChar* hashing. A result equal to
// kInvalidHashCode is mapped to kEmptyHashCode.
int32_t
UnicodeString::doHashCode() const
{
    const int32_t raw = ustr_hashUCharsN(getArrayStart(), length());
    return (raw == kInvalidHashCode) ? kEmptyHashCode : raw;
}
1750+
1751+
//========================================
1752+
// External Buffer
1753+
//========================================
1754+
1755+
char16_t *
1756+
UnicodeString::getBuffer(int32_t minCapacity) {
1757+
if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1758+
fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1759+
setZeroLength();
1760+
return getArrayStart();
1761+
} else {
1762+
return nullptr;
1763+
}
1764+
}
1765+
1766+
void
1767+
UnicodeString::releaseBuffer(int32_t newLength) {
1768+
if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1769+
// set the new fLength
1770+
int32_t capacity=getCapacity();
1771+
if(newLength==-1) {
1772+
// the new length is the string length, capped by fCapacity
1773+
const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1774+
while(p<limit && *p!=0) {
1775+
++p;
1776+
}
1777+
newLength=(int32_t)(p-array);
1778+
} else if(newLength>capacity) {
1779+
newLength=capacity;
1780+
}
1781+
setLength(newLength);
1782+
fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1783+
}
1784+
}
1785+
1786+
//========================================
1787+
// Miscellaneous
1788+
//========================================
1789+
// Ensures that this string owns a writable array of at least newCapacity
// units, allocating a new array when required.
//   newCapacity     -1 is a sentinel for "the current capacity".
//   growCapacity    preferred allocation size; a negative value means "same
//                   as newCapacity". If it cannot be allocated, newCapacity
//                   is tried.
//   doCopyArray     if true, the old contents (as much as fits) are copied
//                   into a new array; otherwise a new array starts with
//                   length zero.
//   pBufferToDelete if non-null and the old ref-counted buffer's count drops
//                   to zero, the buffer is handed to the caller through this
//                   pointer instead of being freed here.
//   forceClone      if true, a new array is allocated unconditionally.
// Returns FALSE if the string is not writable, or if allocation fails (in
// which case the string is set to bogus); TRUE otherwise.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // Default arguments have to be static, so -1 serves as the convenience
  // default and is resolved to the current capacity here.
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // No modification is allowed while a getBuffer(minCapacity) is "open";
  // a bogus string can only be revived by an assignment or similar.
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * A new array is required if
   * - the caller forces it, or
   * - the buffer is read-only, or
   * - the buffer is refCounted (shared) and refCount>1, or
   * - the buffer is too small.
   * Otherwise the current array can be used as is.
   */
  if(!(forceClone ||
       (fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) ||
       ((fUnion.fFields.fLengthAndFlags & kRefCounted) && refCount() > 1) ||
       newCapacity > getCapacity())) {
    return TRUE;
  }

  // resolve the growCapacity default, and prefer the stack buffer when the
  // required capacity fits into it
  if(growCapacity < 0) {
    growCapacity = newCapacity;
  } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
    growCapacity = US_STACKBUF_SIZE;
  }

  // remember the state before allocating
  UChar stackSnapshot[US_STACKBUF_SIZE];
  UChar *priorArray;
  const int32_t priorLength = length();
  const int16_t priorFlags = fUnion.fFields.fLengthAndFlags;

  if(priorFlags & kUsingStackBuffer) {
    U_ASSERT(!(priorFlags & kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
    if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
      // the stack buffer is about to be overwritten with fUnion.fFields
      // values, so its contents are saved first
      us_arrayCopy(fUnion.fStackFields.fBuffer, 0, stackSnapshot, 0, priorLength);
      priorArray = stackSnapshot;
    } else {
      priorArray = NULL; // nothing to copy from the stack buffer to itself
    }
  } else {
    priorArray = fUnion.fFields.fArray;
    U_ASSERT(priorArray != NULL); /* without the stack buffer there must be a non-NULL array */
  }

  // allocate: the preferred size first, then the required minimum
  if(!(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity)))) {
    // Neither size could be allocated. Restore the previous values so that
    // setToBogus() releases the old array.
    if(!(priorFlags & kUsingStackBuffer)) {
      fUnion.fFields.fArray = priorArray;
    }
    fUnion.fFields.fLengthAndFlags = priorFlags;
    setToBogus();
    return FALSE;
  }

  if(doCopyArray) {
    // carry the contents over, limited to what fits into the new array
    // (which may be smaller than the old one)
    int32_t keptLength = priorLength;
    const int32_t actualCapacity = getCapacity();
    if(actualCapacity < keptLength) {
      keptLength = actualCapacity;
    }
    if(priorArray != NULL) {
      us_arrayCopy(priorArray, 0, getArrayStart(), 0, keptLength);
    }
    setLength(keptLength);
  } else {
    setZeroLength();
  }

  // let go of the old array
  if(priorFlags & kRefCounted) {
    // ref-counted: decrement, and release when the count reaches 0
    u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)priorArray - 1);
    if(umtx_atomic_dec(pRefCount) == 0) {
      if(pBufferToDelete == 0) {
        // Note: the cast to (void *) is needed with MSVC, where
        // u_atomic_int32_t is defined as volatile. (Volatile has useful
        // non-standard behavior with this compiler.)
        uprv_free((void *)pRefCount);
      } else {
        // the caller asked to free the buffer itself
        *pBufferToDelete = (int32_t *)pRefCount;
      }
    }
  }
  return TRUE;
}
1897+
1898+
// UnicodeStringAppendable ------------------------------------------------- ***
1899+
1900+
// Out-of-line destructor definition; it has nothing to do.
UnicodeStringAppendable::~UnicodeStringAppendable()
{
}
1901+
1902+
// Appends the single code unit c to the wrapped string and reports whether
// that string is writable afterwards.
UBool
UnicodeStringAppendable::appendCodeUnit(UChar c) {
  UnicodeString &target = str.doAppend(&c, 0, 1);
  return target.isWritable();
}
1906+
1907+
// Appends the UTF-16 encoding of the code point c to the wrapped string.
// Returns FALSE without appending if c cannot be encoded; otherwise reports
// whether the string is writable after the append.
UBool
UnicodeStringAppendable::appendCodePoint(UChar32 c) {
  UChar units[U16_MAX_LENGTH];
  int32_t unitCount = 0;
  UBool invalid = FALSE;
  U16_APPEND(units, unitCount, U16_MAX_LENGTH, c, invalid);
  if(invalid) {
    return FALSE;
  }
  return str.doAppend(units, 0, unitCount).isWritable() ? TRUE : FALSE;
}
1915+
1916+
// Appends length code units starting at s to the wrapped string and reports
// whether that string is writable afterwards.
UBool
UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
  UnicodeString &target = str.doAppend(s, 0, length);
  return target.isWritable();
}
1920+
1921+
// Asks the wrapped string to make room for appendCapacity more code units.
// Returns the result of cloneArrayIfNeeded() for the combined capacity, or
// FALSE (leaving the string untouched) when current length + appendCapacity
// does not fit into int32_t. Without that check the addition would overflow
// (undefined behavior), and a wrapped-around negative sum would make
// cloneArrayIfNeeded() report success without reserving anything.
UBool
UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
  int32_t requiredCapacity;
  if(uprv_add32_overflow(str.length(), appendCapacity, &requiredCapacity)) {
    return FALSE;
  }
  return str.cloneArrayIfNeeded(requiredCapacity);
}
1925+
1926+
// Returns a buffer into which the caller may write text to be appended.
// - minCapacity < 1 or scratchCapacity < minCapacity: returns NULL and sets
//   *resultCapacity to 0.
// - If both requested capacities fit below kMaxCapacity on top of the
//   current length and the string can provide the space, returns the spare
//   room right behind the current contents; *resultCapacity is its size.
// - Otherwise returns scratch with *resultCapacity = scratchCapacity.
UChar *
UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
                                         int32_t desiredCapacityHint,
                                         UChar *scratch, int32_t scratchCapacity,
                                         int32_t *resultCapacity) {
  if(minCapacity < 1 || scratchCapacity < minCapacity) {
    *resultCapacity = 0;
    return NULL;
  }

  const int32_t usedLength = str.length();
  // largest amount that may still be added to the current length
  const int32_t headroom = kMaxCapacity - usedLength;
  if(minCapacity <= headroom &&
     desiredCapacityHint <= headroom &&
     str.cloneArrayIfNeeded(usedLength + minCapacity, usedLength + desiredCapacityHint)) {
    *resultCapacity = str.getCapacity() - usedLength;
    return str.getArrayStart() + usedLength;
  }

  // the string cannot supply the space: fall back to the caller's scratch
  *resultCapacity = scratchCapacity;
  return scratch;
}
1945+
1946+
U_NAMESPACE_END
1947+
1948+
U_NAMESPACE_USE
1949+
1950+
// Hash callback for keys that point to a UnicodeString: returns the string's
// hashCode(), or 0 for a null pointer.
U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key) {
    const UnicodeString *keyString = (const UnicodeString*) key.pointer;
    if (keyString == NULL) {
        return 0;
    }
    return keyString->hashCode();
}
1955+
1956+
// Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1957+
// does not depend on hashtable code.
1958+
U_CAPI UBool U_EXPORT2
1959+
uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1960+
const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1961+
const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1962+
if (str1 == str2) {
1963+
return TRUE;
1964+
}
1965+
if (str1 == NULL || str2 == NULL) {
1966+
return FALSE;
1967+
}
1968+
return *str1 == *str2;
1969+
}
1970+
1971+
#ifdef U_STATIC_IMPLEMENTATION
1972+
/*
1973+
This should never be called. It is defined here to make sure that the
1974+
virtual vector deleting destructor is defined within unistr.cpp.
1975+
The vector deleting destructor is already a part of UObject,
1976+
but defining it here makes sure that it is included with this object file.
1977+
This makes sure that static library dependencies are kept to a minimum.
1978+
*/
1979+
static void uprv_UnicodeStringDummy(void) {
1980+
delete [] (new UnicodeString[2]);
1981+
}
1982+
#endif

0 commit comments

Comments
 (0)
Please sign in to comment.