MyGUI  3.2.1
MyGUI_UString.cpp
Go to the documentation of this file.
1 /*
2  * This source file is part of MyGUI. For the latest info, see http://mygui.info/
3  * Distributed under the MIT License
4  * (See accompanying file COPYING.MIT or copy at http://opensource.org/licenses/MIT)
5  */
6 
7 #include "MyGUI_Precompiled.h"
8 #include "MyGUI_UString.h"
9 
10 namespace MyGUI
11 {
12 
13  //--------------------------------------------------------------------------
15  {
16  mString = 0;
17  }
18  //--------------------------------------------------------------------------
20  {
21  mIter += c;
22  }
23  //--------------------------------------------------------------------------
25  {
26  mIter -= c;
27  }
28  //--------------------------------------------------------------------------
30  {
31  mIter = i.mIter;
32  mString = i.mString;
33  }
34  //--------------------------------------------------------------------------
36  {
37  return mIter == mString->mData.begin();
38  }
39  //--------------------------------------------------------------------------
41  {
42  return mIter == mString->mData.end();
43  }
44  //--------------------------------------------------------------------------
46  {
47  return mIter - mString->mData.begin();
48  }
49  //--------------------------------------------------------------------------
51  {
52  mIter = mString->mData.begin() + index;
53  }
54  //--------------------------------------------------------------------------
56  {
57  size_type current_index = _get_index();
58  return mString->getChar( current_index );
59  }
60  //--------------------------------------------------------------------------
62  {
63  size_type current_index = _get_index();
64  int change = mString->setChar( current_index, uc );
65  _jump_to( current_index );
66  return change;
67  }
68  //--------------------------------------------------------------------------
70  {
71  _seekFwd( 1 ); // move 1 code point forward
72  if ( _test_end() ) return; // exit if we hit the end
73  if ( _utf16_surrogate_follow( mIter[0] ) ) {
74  // landing on a follow code point means we might be part of a bigger character
75  // so we test for that
76  code_point lead_half = 0;
77  //NB: we can't possibly be at the beginning here, so no need to test
78  lead_half = mIter[-1]; // check the previous code point to see if we're part of a surrogate pair
79  if ( _utf16_surrogate_lead( lead_half ) ) {
80  _seekFwd( 1 ); // if so, then advance 1 more code point
81  }
82  }
83  }
84  //--------------------------------------------------------------------------
86  {
87  _seekRev( 1 ); // move 1 code point backwards
88  if ( _test_begin() ) return; // exit if we hit the beginning
89  if ( _utf16_surrogate_follow( mIter[0] ) ) {
90  // landing on a follow code point means we might be part of a bigger character
91  // so we test for that
92  code_point lead_half = 0;
93  lead_half = mIter[-1]; // check the previous character to see if we're part of a surrogate pair
94  if ( _utf16_surrogate_lead( lead_half ) ) {
95  _seekRev( 1 ); // if so, then rewind 1 more code point
96  }
97  }
98  }
99  //--------------------------------------------------------------------------
100  //--------------------------------------------------------------------------
101  //--------------------------------------------------------------------------
102  //--------------------------------------------------------------------------
104  {
105 
106  }
107  //--------------------------------------------------------------------------
109  {
110  _become( i );
111  }
112  //--------------------------------------------------------------------------
114  {
115  _seekFwd( 1 );
116  return *this;
117  }
118  //--------------------------------------------------------------------------
120  {
121  _fwd_iterator tmp( *this );
122  _seekFwd( 1 );
123  return tmp;
124  }
125  //--------------------------------------------------------------------------
127  {
128  _seekRev( 1 );
129  return *this;
130  }
131  //--------------------------------------------------------------------------
133  {
134  _fwd_iterator tmp( *this );
135  _seekRev( 1 );
136  return tmp;
137  }
138  //--------------------------------------------------------------------------
140  {
141  _fwd_iterator tmp( *this );
142  if ( n < 0 )
143  tmp._seekRev( -n );
144  else
145  tmp._seekFwd( n );
146  return tmp;
147  }
148  //--------------------------------------------------------------------------
150  {
151  _fwd_iterator tmp( *this );
152  if ( n < 0 )
153  tmp._seekFwd( -n );
154  else
155  tmp._seekRev( n );
156  return tmp;
157  }
158  //--------------------------------------------------------------------------
160  {
161  if ( n < 0 )
162  _seekRev( -n );
163  else
164  _seekFwd( n );
165  return *this;
166  }
167  //--------------------------------------------------------------------------
169  {
170  if ( n < 0 )
171  _seekFwd( -n );
172  else
173  _seekRev( n );
174  return *this;
175  }
176  //--------------------------------------------------------------------------
178  {
179  return *mIter;
180  }
181  //--------------------------------------------------------------------------
183  {
184  _fwd_iterator tmp( *this );
185  tmp += n;
186  return *tmp;
187  }
188  //--------------------------------------------------------------------------
190  {
191  _moveNext();
192  return *this;
193  }
194  //--------------------------------------------------------------------------
196  {
197  _movePrev();
198  return *this;
199  }
200  //--------------------------------------------------------------------------
202  {
203  return _getCharacter();
204  }
205  //--------------------------------------------------------------------------
207  {
208  return _setCharacter( uc );
209  }
210  //--------------------------------------------------------------------------
211  //--------------------------------------------------------------------------
212  //--------------------------------------------------------------------------
213  //--------------------------------------------------------------------------
215  {
216 
217  }
218  //--------------------------------------------------------------------------
220  {
221  _become( i );
222  }
223  //--------------------------------------------------------------------------
225  {
226  _become( i );
227  }
228  //--------------------------------------------------------------------------
230  {
231  _seekFwd( 1 );
232  return *this;
233  }
234  //--------------------------------------------------------------------------
236  {
237  _const_fwd_iterator tmp( *this );
238  _seekFwd( 1 );
239  return tmp;
240  }
241  //--------------------------------------------------------------------------
243  {
244  _seekRev( 1 );
245  return *this;
246  }
247  //--------------------------------------------------------------------------
249  {
250  _const_fwd_iterator tmp( *this );
251  _seekRev( 1 );
252  return tmp;
253  }
254  //--------------------------------------------------------------------------
256  {
257  _const_fwd_iterator tmp( *this );
258  if ( n < 0 )
259  tmp._seekRev( -n );
260  else
261  tmp._seekFwd( n );
262  return tmp;
263  }
264  //--------------------------------------------------------------------------
266  {
267  _const_fwd_iterator tmp( *this );
268  if ( n < 0 )
269  tmp._seekFwd( -n );
270  else
271  tmp._seekRev( n );
272  return tmp;
273  }
274  //--------------------------------------------------------------------------
276  {
277  if ( n < 0 )
278  _seekRev( -n );
279  else
280  _seekFwd( n );
281  return *this;
282  }
283  //--------------------------------------------------------------------------
285  {
286  if ( n < 0 )
287  _seekFwd( -n );
288  else
289  _seekRev( n );
290  return *this;
291  }
292  //--------------------------------------------------------------------------
294  {
295  return *mIter;
296  }
297  //--------------------------------------------------------------------------
299  {
300  _const_fwd_iterator tmp( *this );
301  tmp += n;
302  return *tmp;
303  }
304  //--------------------------------------------------------------------------
306  {
307  _moveNext();
308  return *this;
309  }
310  //--------------------------------------------------------------------------
312  {
313  _movePrev();
314  return *this;
315  }
316  //--------------------------------------------------------------------------
318  {
319  return _getCharacter();
320  }
321  //--------------------------------------------------------------------------
322  //--------------------------------------------------------------------------
323  //--------------------------------------------------------------------------
324  //--------------------------------------------------------------------------
326  {
327 
328  }
329  //--------------------------------------------------------------------------
331  {
332  _become( i );
333  }
334  //--------------------------------------------------------------------------
336  {
337  _seekRev( 1 );
338  return *this;
339  }
340  //--------------------------------------------------------------------------
342  {
343  _rev_iterator tmp( *this );
344  _seekRev( 1 );
345  return tmp;
346  }
347  //--------------------------------------------------------------------------
349  {
350  _seekFwd( 1 );
351  return *this;
352  }
353  //--------------------------------------------------------------------------
355  {
356  _rev_iterator tmp( *this );
357  _seekFwd( 1 );
358  return tmp;
359  }
360  //--------------------------------------------------------------------------
362  {
363  _rev_iterator tmp( *this );
364  if ( n < 0 )
365  tmp._seekFwd( -n );
366  else
367  tmp._seekRev( n );
368  return tmp;
369  }
370  //--------------------------------------------------------------------------
372  {
373  _rev_iterator tmp( *this );
374  if ( n < 0 )
375  tmp._seekRev( -n );
376  else
377  tmp._seekFwd( n );
378  return tmp;
379  }
380  //--------------------------------------------------------------------------
382  {
383  if ( n < 0 )
384  _seekFwd( -n );
385  else
386  _seekRev( n );
387  return *this;
388  }
389  //--------------------------------------------------------------------------
391  {
392  if ( n < 0 )
393  _seekRev( -n );
394  else
395  _seekFwd( n );
396  return *this;
397  }
398  //--------------------------------------------------------------------------
400  {
401  return mIter[-1];
402  }
403  //--------------------------------------------------------------------------
405  {
406  _rev_iterator tmp( *this );
407  tmp -= n;
408  return *tmp;
409  }
410  //--------------------------------------------------------------------------
411  //--------------------------------------------------------------------------
412  //--------------------------------------------------------------------------
413  //--------------------------------------------------------------------------
415  {
416 
417  }
418  //--------------------------------------------------------------------------
420  {
421  _become( i );
422  }
423  //--------------------------------------------------------------------------
425  {
426  _become( i );
427  }
428  //--------------------------------------------------------------------------
430  {
431  _seekRev( 1 );
432  return *this;
433  }
434  //--------------------------------------------------------------------------
436  {
437  _const_rev_iterator tmp( *this );
438  _seekRev( 1 );
439  return tmp;
440  }
441  //--------------------------------------------------------------------------
443  {
444  _seekFwd( 1 );
445  return *this;
446  }
447  //--------------------------------------------------------------------------
449  {
450  _const_rev_iterator tmp( *this );
451  _seekFwd( 1 );
452  return tmp;
453  }
454  //--------------------------------------------------------------------------
456  {
457  _const_rev_iterator tmp( *this );
458  if ( n < 0 )
459  tmp._seekFwd( -n );
460  else
461  tmp._seekRev( n );
462  return tmp;
463  }
464  //--------------------------------------------------------------------------
466  {
467  _const_rev_iterator tmp( *this );
468  if ( n < 0 )
469  tmp._seekRev( -n );
470  else
471  tmp._seekFwd( n );
472  return tmp;
473  }
474  //--------------------------------------------------------------------------
476  {
477  if ( n < 0 )
478  _seekFwd( -n );
479  else
480  _seekRev( n );
481  return *this;
482  }
483  //--------------------------------------------------------------------------
485  {
486  if ( n < 0 )
487  _seekRev( -n );
488  else
489  _seekFwd( n );
490  return *this;
491  }
492  //--------------------------------------------------------------------------
494  {
495  return mIter[-1];
496  }
497  //--------------------------------------------------------------------------
499  {
500  _const_rev_iterator tmp( *this );
501  tmp -= n;
502  return *tmp;
503  }
504  //--------------------------------------------------------------------------
505  //--------------------------------------------------------------------------
506  //--------------------------------------------------------------------------
507  //--------------------------------------------------------------------------
509  {
510  _init();
511  }
512  //--------------------------------------------------------------------------
513  UString::UString( const UString& copy )
514  {
515  _init();
516  mData = copy.mData;
517  }
518  //--------------------------------------------------------------------------
520  {
521  _init();
522  assign( length, ch );
523  }
524  //--------------------------------------------------------------------------
526  {
527  _init();
528  assign( str );
529  }
530  //--------------------------------------------------------------------------
532  {
533  _init();
534  assign( str, length );
535  }
536  //--------------------------------------------------------------------------
538  {
539  _init();
540  assign( str, index, length );
541  }
542  //--------------------------------------------------------------------------
543 #if MYGUI_IS_NATIVE_WCHAR_T
544  UString::UString( const wchar_t* w_str )
545  {
546  _init();
547  assign( w_str );
548  }
549  //--------------------------------------------------------------------------
550  UString::UString( const wchar_t* w_str, size_type length )
551  {
552  _init();
553  assign( w_str, length );
554  }
555 #endif
556  //--------------------------------------------------------------------------
557  UString::UString( const std::wstring& wstr )
558  {
559  _init();
560  assign( wstr );
561  }
562  //--------------------------------------------------------------------------
563  UString::UString( const char* c_str )
564  {
565  _init();
566  assign( c_str );
567  }
568  //--------------------------------------------------------------------------
570  {
571  _init();
572  assign( c_str, length );
573  }
574  //--------------------------------------------------------------------------
575  UString::UString( const std::string& str )
576  {
577  _init();
578  assign( str );
579  }
580  //--------------------------------------------------------------------------
582  {
583  _cleanBuffer();
584  }
585  //--------------------------------------------------------------------------
587  {
588  return mData.size();
589  }
590  //--------------------------------------------------------------------------
592  {
593  return size();
594  }
595  //--------------------------------------------------------------------------
597  {
598  const_iterator i = begin(), ie = end();
599  size_type c = 0;
600  while ( i != ie ) {
601  i.moveNext();
602  ++c;
603  }
604  return c;
605  }
606  //--------------------------------------------------------------------------
608  {
609  return mData.max_size();
610  }
611  //--------------------------------------------------------------------------
613  {
614  mData.reserve( size );
615  }
616  //--------------------------------------------------------------------------
617  void UString::resize( size_type num, const code_point& val /*= 0 */ )
618  {
619  mData.resize( num, val );
620  }
621  //--------------------------------------------------------------------------
622  void UString::swap( UString& from )
623  {
624  mData.swap( from.mData );
625  }
626  //--------------------------------------------------------------------------
627  bool UString::empty() const
628  {
629  return mData.empty();
630  }
631  //--------------------------------------------------------------------------
633  {
634  return mData.c_str();
635  }
636  //--------------------------------------------------------------------------
638  {
639  return c_str();
640  }
641  //--------------------------------------------------------------------------
643  {
644  return mData.capacity();
645  }
646  //--------------------------------------------------------------------------
648  {
649  mData.clear();
650  }
651  //--------------------------------------------------------------------------
652  UString UString::substr( size_type index, size_type num /*= npos */ ) const
653  {
654  // this could avoid the extra copy if we used a private specialty constructor
655  dstring data = mData.substr( index, num );
656  UString tmp;
657  tmp.mData.swap( data );
658  return tmp;
659  }
660  //--------------------------------------------------------------------------
662  {
663  code_point cp[2];
664  size_t c = _utf32_to_utf16( val, cp );
665  if ( c > 0 ) push_back( cp[0] );
666  if ( c > 1 ) push_back( cp[1] );
667  }
668  //--------------------------------------------------------------------------
669 #if MYGUI_IS_NATIVE_WCHAR_T
670  void UString::push_back( wchar_t val )
671  {
672  // we do this because the Unicode method still preserves UTF-16 code points
673  mData.push_back( static_cast<code_point>( val ) );
674  }
675 #endif
676  //--------------------------------------------------------------------------
678  {
679  mData.push_back( val );
680  }
681 
682  void UString::push_back( char val )
683  {
684  mData.push_back( static_cast<code_point>( val ) );
685  }
686 
688  {
689  const_iterator i, ie = end();
690  for ( i = begin(); i != ie; i.moveNext() ) {
691  if ( i.getCharacter() == ch )
692  return true;
693  }
694  return false;
695  }
696 
697  const std::string& UString::asUTF8() const
698  {
699  _load_buffer_UTF8();
700  return *m_buffer.mStrBuffer;
701  }
702 
703  const char* UString::asUTF8_c_str() const
704  {
705  _load_buffer_UTF8();
706  return m_buffer.mStrBuffer->c_str();
707  }
708 
710  {
711  _load_buffer_UTF32();
712  return *m_buffer.mUTF32StrBuffer;
713  }
714 
716  {
717  _load_buffer_UTF32();
718  return m_buffer.mUTF32StrBuffer->c_str();
719  }
720 
721  const std::wstring& UString::asWStr() const
722  {
723  _load_buffer_WStr();
724  return *m_buffer.mWStrBuffer;
725  }
726 
727  const wchar_t* UString::asWStr_c_str() const
728  {
729  _load_buffer_WStr();
730  return m_buffer.mWStrBuffer->c_str();
731  }
732 
734  {
735  return mData.at( loc );
736  }
737 
739  {
740  return mData.at( loc );
741  }
742 
744  {
745  const code_point* ptr = c_str();
746  unicode_char uc;
747  size_t l = _utf16_char_length( ptr[loc] );
748  code_point cp[2] = { /* blame the code beautifier */
749  0, 0
750  };
751  cp[0] = ptr[loc];
752 
753  if ( l == 2 && ( loc + 1 ) < mData.length() ) {
754  cp[1] = ptr[loc+1];
755  }
756  _utf16_to_utf32( cp, uc );
757  return uc;
758  }
759 
761  {
762  code_point cp[2] = { /* blame the code beautifier */
763  0, 0
764  };
765  size_t l = _utf32_to_utf16( ch, cp );
766  unicode_char existingChar = getChar( loc );
767  size_t existingSize = _utf16_char_length( existingChar );
768  size_t newSize = _utf16_char_length( ch );
769 
770  if ( newSize > existingSize ) {
771  at( loc ) = cp[0];
772  insert( loc + 1, 1, cp[1] );
773  return 1;
774  }
775  if ( newSize < existingSize ) {
776  erase( loc, 1 );
777  at( loc ) = cp[0];
778  return -1;
779  }
780 
781  // newSize == existingSize
782  at( loc ) = cp[0];
783  if ( l == 2 ) at( loc + 1 ) = cp[1];
784  return 0;
785  }
786 
788  {
789  iterator i;
790  i.mIter = mData.begin();
791  i.mString = this;
792  return i;
793  }
794 
796  {
797  const_iterator i;
798  i.mIter = const_cast<UString*>( this )->mData.begin();
799  i.mString = const_cast<UString*>( this );
800  return i;
801  }
802 
804  {
805  iterator i;
806  i.mIter = mData.end();
807  i.mString = this;
808  return i;
809  }
810 
812  {
813  const_iterator i;
814  i.mIter = const_cast<UString*>( this )->mData.end();
815  i.mString = const_cast<UString*>( this );
816  return i;
817  }
818 
820  {
822  i.mIter = mData.end();
823  i.mString = this;
824  return i;
825  }
826 
828  {
830  i.mIter = const_cast<UString*>( this )->mData.end();
831  i.mString = const_cast<UString*>( this );
832  return i;
833  }
834 
836  {
838  i.mIter = mData.begin();
839  i.mString = this;
840  return i;
841  }
842 
844  {
846  i.mIter = const_cast<UString*>( this )->mData.begin();
847  i.mString = const_cast<UString*>( this );
848  return i;
849  }
850 
852  {
853  mData.assign( start.mIter, end.mIter );
854  return *this;
855  }
856 
858  {
859  mData.assign( str.mData );
860  return *this;
861  }
862 
864  {
865  mData.assign( str );
866  return *this;
867  }
868 
870  {
871  mData.assign( str, num );
872  return *this;
873  }
874 
876  {
877  mData.assign( str.mData, index, len );
878  return *this;
879  }
880 
882  {
883  mData.assign( num, ch );
884  return *this;
885  }
886 
887  UString& UString::assign( const std::wstring& wstr )
888  {
889  mData.clear();
890  mData.reserve( wstr.length() ); // best guess bulk allocate
891 #ifdef WCHAR_UTF16 // if we're already working in UTF-16, this is easy
892  code_point tmp;
893  std::wstring::const_iterator i, ie = wstr.end();
894  for ( i = wstr.begin(); i != ie; i++ ) {
895  tmp = static_cast<code_point>( *i );
896  mData.push_back( tmp );
897  }
898 #else // otherwise we do it the safe way (which is still 100% safe to pass UTF-16 through, just slower)
899  code_point cp[3] = {0, 0, 0};
900  unicode_char tmp;
901  std::wstring::const_iterator i, ie = wstr.end();
902  for ( i = wstr.begin(); i != ie; i++ ) {
903  tmp = static_cast<unicode_char>( *i );
904  size_t l = _utf32_to_utf16( tmp, cp );
905  if ( l > 0 ) mData.push_back( cp[0] );
906  if ( l > 1 ) mData.push_back( cp[1] );
907  }
908 #endif
909  return *this;
910  }
911 
912 #if MYGUI_IS_NATIVE_WCHAR_T
913  UString& UString::assign( const wchar_t* w_str )
914  {
915  std::wstring tmp;
916  tmp.assign( w_str );
917  return assign( tmp );
918  }
919 
920  UString& UString::assign( const wchar_t* w_str, size_type num )
921  {
922  std::wstring tmp;
923  tmp.assign( w_str, num );
924  return assign( tmp );
925  }
926 #endif
927 
928  UString& UString::assign( const std::string& str )
929  {
930  size_type len = _verifyUTF8( str );
931  clear(); // empty our contents, if there are any
932  reserve( len ); // best guess bulk capacity growth
933 
934  // This is a 3 step process, converting each byte in the UTF-8 stream to UTF-32,
935  // then converting it to UTF-16, then finally appending the data buffer
936 
937  unicode_char uc; // temporary Unicode character buffer
938  unsigned char utf8buf[7]; // temporary UTF-8 buffer
939  utf8buf[6] = 0;
940  size_t utf8len; // UTF-8 length
941  code_point utf16buff[3]; // temporary UTF-16 buffer
942  utf16buff[2] = 0;
943  size_t utf16len; // UTF-16 length
944 
945  std::string::const_iterator i, ie = str.end();
946  for ( i = str.begin(); i != ie; i++ ) {
947  utf8len = _utf8_char_length( static_cast<unsigned char>( *i ) ); // estimate bytes to load
948  for ( size_t j = 0; j < utf8len; j++ ) { // load the needed UTF-8 bytes
949  utf8buf[j] = ( static_cast<unsigned char>( *( i + j ) ) ); // we don't increment 'i' here just in case the estimate is wrong (shouldn't happen, but we're being careful)
950  }
951  utf8buf[utf8len] = 0; // nul terminate so we throw an exception before running off the end of the buffer
952  utf8len = _utf8_to_utf32( utf8buf, uc ); // do the UTF-8 -> UTF-32 conversion
953  i += utf8len - 1; // we subtract 1 for the increment of the 'for' loop
954 
955  utf16len = _utf32_to_utf16( uc, utf16buff ); // UTF-32 -> UTF-16 conversion
956  append( utf16buff, utf16len ); // append the characters to the string
957  }
958  return *this;
959  }
960 
961  UString& UString::assign( const char* c_str )
962  {
963  std::string tmp( c_str );
964  return assign( tmp );
965  }
966 
967  UString& UString::assign( const char* c_str, size_type num )
968  {
969  std::string tmp;
970  tmp.assign( c_str, num );
971  return assign( tmp );
972  }
973 
975  {
976  mData.append( str.mData );
977  return *this;
978  }
979 
981  {
982  mData.append( str );
983  return *this;
984  }
985 
987  {
988  mData.append( str.mData, index, len );
989  return *this;
990  }
991 
993  {
994  mData.append( str, num );
995  return *this;
996  }
997 
999  {
1000  mData.append( num, ch );
1001  return *this;
1002  }
1003 
1005  {
1006  mData.append( start.mIter, end.mIter );
1007  return *this;
1008  }
1009 
1010 #if MYGUI_IS_NATIVE_WCHAR_T
1011  UString& UString::append( const wchar_t* w_str, size_type num )
1012  {
1013  std::wstring tmp( w_str, num );
1014  return append( tmp );
1015  }
1016 
1017  UString& UString::append( size_type num, wchar_t ch )
1018  {
1019  return append( num, static_cast<unicode_char>( ch ) );
1020  }
1021 #endif
1023  {
1024  UString tmp( c_str, num );
1025  append( tmp );
1026  return *this;
1027  }
1028 
1030  {
1031  append( num, static_cast<code_point>( ch ) );
1032  return *this;
1033  }
1034 
1036  {
1037  code_point cp[2] = {0, 0};
1038  if ( _utf32_to_utf16( ch, cp ) == 2 ) {
1039  for ( size_type i = 0; i < num; i++ ) {
1040  append( 1, cp[0] );
1041  append( 1, cp[1] );
1042  }
1043  } else {
1044  for ( size_type i = 0; i < num; i++ ) {
1045  append( 1, cp[0] );
1046  }
1047  }
1048  return *this;
1049  }
1050 
1052  {
1053  iterator ret;
1054  ret.mIter = mData.insert( i.mIter, ch );
1055  ret.mString = this;
1056  return ret;
1057  }
1058 
1060  {
1061  mData.insert( index, str.mData );
1062  return *this;
1063  }
1064 
1065  UString& UString::insert( size_type index1, const UString& str, size_type index2, size_type num )
1066  {
1067  mData.insert( index1, str.mData, index2, num );
1068  return *this;
1069  }
1070 
1072  {
1073  mData.insert( i.mIter, start.mIter, end.mIter );
1074  }
1075 
1077  {
1078  mData.insert( index, str, num );
1079  return *this;
1080  }
1081 
1082 #if MYGUI_IS_NATIVE_WCHAR_T
1083  UString& UString::insert( size_type index, const wchar_t* w_str, size_type num )
1084  {
1085  UString tmp( w_str, num );
1086  insert( index, tmp );
1087  return *this;
1088  }
1089 #endif
1090 
1091  UString& UString::insert( size_type index, const char* c_str, size_type num )
1092  {
1093  UString tmp( c_str, num );
1094  insert( index, tmp );
1095  return *this;
1096  }
1097 
1099  {
1100  mData.insert( index, num, ch );
1101  return *this;
1102  }
1103 
1104 #if MYGUI_IS_NATIVE_WCHAR_T
1105  UString& UString::insert( size_type index, size_type num, wchar_t ch )
1106  {
1107  insert( index, num, static_cast<unicode_char>( ch ) );
1108  return *this;
1109  }
1110 #endif
1111 
1112  UString& UString::insert( size_type index, size_type num, char ch )
1113  {
1114  insert( index, num, static_cast<code_point>( ch ) );
1115  return *this;
1116  }
1117 
1119  {
1120  code_point cp[3] = {0, 0, 0};
1121  size_t l = _utf32_to_utf16( ch, cp );
1122  if ( l == 1 ) {
1123  return insert( index, num, cp[0] );
1124  }
1125  for ( size_type c = 0; c < num; c++ ) {
1126  // insert in reverse order to preserve ordering after insert
1127  insert( index, 1, cp[1] );
1128  insert( index, 1, cp[0] );
1129  }
1130  return *this;
1131  }
1132 
1133  void UString::insert( iterator i, size_type num, const code_point& ch )
1134  {
1135  mData.insert( i.mIter, num, ch );
1136  }
1137 #if MYGUI_IS_NATIVE_WCHAR_T
1138  void UString::insert( iterator i, size_type num, const wchar_t& ch )
1139  {
1140  insert( i, num, static_cast<unicode_char>( ch ) );
1141  }
1142 #endif
1143 
1144  void UString::insert( iterator i, size_type num, const char& ch )
1145  {
1146  insert( i, num, static_cast<code_point>( ch ) );
1147  }
1148 
1150  {
1151  code_point cp[3] = {0, 0, 0};
1152  size_t l = _utf32_to_utf16( ch, cp );
1153  if ( l == 1 ) {
1154  insert( i, num, cp[0] );
1155  } else {
1156  for ( size_type c = 0; c < num; c++ ) {
1157  // insert in reverse order to preserve ordering after insert
1158  insert( i, 1, cp[1] );
1159  insert( i, 1, cp[0] );
1160  }
1161  }
1162  }
1163 
1165  {
1166  iterator ret;
1167  ret.mIter = mData.erase( loc.mIter );
1168  ret.mString = this;
1169  return ret;
1170  }
1171 
1173  {
1174  iterator ret;
1175  ret.mIter = mData.erase( start.mIter, end.mIter );
1176  ret.mString = this;
1177  return ret;
1178  }
1179 
1180  UString& UString::erase( size_type index /*= 0*/, size_type num /*= npos */ )
1181  {
1182  if ( num == npos )
1183  mData.erase( index );
1184  else
1185  mData.erase( index, num );
1186  return *this;
1187  }
1188 
1189  UString& UString::replace( size_type index1, size_type num1, const UString& str )
1190  {
1191  mData.replace( index1, num1, str.mData, 0, npos );
1192  return *this;
1193  }
1194 
1195  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type num2 )
1196  {
1197  mData.replace( index1, num1, str.mData, 0, num2 );
1198  return *this;
1199  }
1200 
1201  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type index2, size_type num2 )
1202  {
1203  mData.replace( index1, num1, str.mData, index2, num2 );
1204  return *this;
1205  }
1206 
1207  UString& UString::replace( iterator start, iterator end, const UString& str, size_type num /*= npos */ )
1208  {
1209  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1210 
1211  size_type index1 = begin() - st;
1212  size_type num1 = end - st;
1213  return replace( index1, num1, str, 0, num );
1214  }
1215 
1217  {
1218  mData.replace( index, num1, num2, ch );
1219  return *this;
1220  }
1221 
1223  {
1224  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1225 
1226  size_type index1 = begin() - st;
1227  size_type num1 = end - st;
1228  return replace( index1, num1, num, ch );
1229  }
1230 
1231  int UString::compare( const UString& str ) const
1232  {
1233  return mData.compare( str.mData );
1234  }
1235 
1236  int UString::compare( const code_point* str ) const
1237  {
1238  return mData.compare( str );
1239  }
1240 
1241  int UString::compare( size_type index, size_type length, const UString& str ) const
1242  {
1243  return mData.compare( index, length, str.mData );
1244  }
1245 
1246  int UString::compare( size_type index, size_type length, const UString& str, size_type index2, size_type length2 ) const
1247  {
1248  return mData.compare( index, length, str.mData, index2, length2 );
1249  }
1250 
1251  int UString::compare( size_type index, size_type length, const code_point* str, size_type length2 ) const
1252  {
1253  return mData.compare( index, length, str, length2 );
1254  }
1255 
1256 #if MYGUI_IS_NATIVE_WCHAR_T
1257  int UString::compare( size_type index, size_type length, const wchar_t* w_str, size_type length2 ) const
1258  {
1259  UString tmp( w_str, length2 );
1260  return compare( index, length, tmp );
1261  }
1262 #endif
1263 
1264  int UString::compare( size_type index, size_type length, const char* c_str, size_type length2 ) const
1265  {
1266  UString tmp( c_str, length2 );
1267  return compare( index, length, tmp );
1268  }
1269 
1270  UString::size_type UString::find( const UString& str, size_type index /*= 0 */ ) const
1271  {
1272  return mData.find( str.c_str(), index );
1273  }
1274 
1276  {
 // Copies cp_str into a temporary UString and passes tmp.c_str(), index and length on to mData.find.
 // NOTE(review): length is forwarded as-is; presumably it must not exceed the temporary's size - confirm with callers.
1277  UString tmp( cp_str );
1278  return mData.find( tmp.c_str(), index, length );
1279  }
1280 
1282  {
 // Builds a temporary UString from c_str and passes tmp.c_str(), index and length on to mData.find.
 // NOTE(review): length is applied to the temporary's code points, not to the bytes of c_str - confirm this is what callers expect.
1283  UString tmp( c_str );
1284  return mData.find( tmp.c_str(), index, length );
1285  }
1286 
#if MYGUI_IS_NATIVE_WCHAR_T
// Builds a temporary UString from w_str, then hands its code-point pointer
// together with index and length to mData.find.
UString::size_type UString::find( const wchar_t* w_str, size_type index, size_type length ) const
{
    const UString needle( w_str );
    return mData.find( needle.c_str(), index, length );
}
#endif
1294 
1295  UString::size_type UString::find( char ch, size_type index /*= 0 */ ) const
1296  {
1297  return find( static_cast<code_point>( ch ), index );
1298  }
1299 
1301  {
 // Direct search: ch and index are passed straight to mData.find.
1302  return mData.find( ch, index );
1303  }
1304 
#if MYGUI_IS_NATIVE_WCHAR_T
// Searches for a wide character by casting it to unicode_char and delegating
// to the unicode_char overload of find.
UString::size_type UString::find( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return find( character, index );
}
#endif
1311 
1313  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and searches for that string.
1314  code_point cp[3] = {0, 0, 0};
1315  size_t l = _utf32_to_utf16( ch, cp );
1316  return find( UString( cp, l ), index );
1317  }
1318 
1319  UString::size_type UString::rfind( const UString& str, size_type index /*= 0 */ ) const
1320  {
1321  return mData.rfind( str.c_str(), index );
1322  }
1323 
1325  {
 // Copies cp_str into a temporary UString and passes tmp.c_str(), index and num on to mData.rfind.
 // NOTE(review): num is forwarded as-is; presumably it must not exceed the temporary's size - confirm with callers.
1326  UString tmp( cp_str );
1327  return mData.rfind( tmp.c_str(), index, num );
1328  }
1329 
1330  UString::size_type UString::rfind( const char* c_str, size_type index, size_type num ) const
1331  {
1332  UString tmp( c_str );
1333  return mData.rfind( tmp.c_str(), index, num );
1334  }
1335 
#if MYGUI_IS_NATIVE_WCHAR_T
// Builds a temporary UString from w_str, then hands its code-point pointer
// together with index and num to mData.rfind.
UString::size_type UString::rfind( const wchar_t* w_str, size_type index, size_type num ) const
{
    const UString needle( w_str );
    return mData.rfind( needle.c_str(), index, num );
}
#endif
1343 
1344  UString::size_type UString::rfind( char ch, size_type index /*= 0 */ ) const
1345  {
1346  return rfind( static_cast<code_point>( ch ), index );
1347  }
1348 
1350  {
 // Direct reverse search: ch and index are passed straight to mData.rfind.
1351  return mData.rfind( ch, index );
1352  }
1353 
#if MYGUI_IS_NATIVE_WCHAR_T
// Reverse search for a wide character: casts it to unicode_char and delegates
// to the unicode_char overload of rfind.
UString::size_type UString::rfind( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return rfind( character, index );
}
#endif
1360 
1362  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and reverse-searches for that string.
1363  code_point cp[3] = {0, 0, 0};
1364  size_t l = _utf32_to_utf16( ch, cp );
1365  return rfind( UString( cp, l ), index );
1366  }
1367 
1368  UString::size_type UString::find_first_of( const UString &str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1369  {
1370  size_type i = 0;
1371  const size_type len = length();
1372  while ( i < num && ( index + i ) < len ) {
1373  unicode_char ch = getChar( index + i );
1374  if ( str.inString( ch ) )
1375  return index + i;
1376  i += _utf16_char_length( ch ); // increment by the Unicode character length
1377  }
1378  return npos;
1379  }
1380 
1382  {
 // Wraps the single value ch in a temporary UString (assign( 1, ch )) and delegates to the UString overload.
1383  UString tmp;
1384  tmp.assign( 1, ch );
1385  return find_first_of( tmp, index );
1386  }
1387 
1388  UString::size_type UString::find_first_of( char ch, size_type index /*= 0 */ ) const
1389  {
1390  return find_first_of( static_cast<code_point>( ch ), index );
1391  }
1392 
#if MYGUI_IS_NATIVE_WCHAR_T
// Wide-character convenience overload: casts ch to unicode_char and delegates
// to the unicode_char overload of find_first_of.
UString::size_type UString::find_first_of( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return find_first_of( character, index );
}
#endif
1399 
1401  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and delegates to the UString overload.
1402  code_point cp[3] = {0, 0, 0};
1403  size_t l = _utf32_to_utf16( ch, cp );
1404  return find_first_of( UString( cp, l ), index );
1405  }
1406 
1407  UString::size_type UString::find_first_not_of( const UString& str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1408  {
1409  size_type i = 0;
1410  const size_type len = length();
1411  while ( i < num && ( index + i ) < len ) {
1412  unicode_char ch = getChar( index + i );
1413  if ( !str.inString( ch ) )
1414  return index + i;
1415  i += _utf16_char_length( ch ); // increment by the Unicode character length
1416  }
1417  return npos;
1418  }
1419 
1421  {
 // Wraps the single value ch in a temporary UString (assign( 1, ch )) and delegates to the UString overload.
1422  UString tmp;
1423  tmp.assign( 1, ch );
1424  return find_first_not_of( tmp, index );
1425  }
1426 
1428  {
 // Casts ch to code_point and delegates to the code_point overload of find_first_not_of.
1429  return find_first_not_of( static_cast<code_point>( ch ), index );
1430  }
1431 
#if MYGUI_IS_NATIVE_WCHAR_T
// Wide-character convenience overload: casts ch to unicode_char and delegates
// to the unicode_char overload of find_first_not_of.
UString::size_type UString::find_first_not_of( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return find_first_not_of( character, index );
}
#endif
1438 
1440  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and delegates to the UString overload.
1441  code_point cp[3] = {0, 0, 0};
1442  size_t l = _utf32_to_utf16( ch, cp );
1443  return find_first_not_of( UString( cp, l ), index );
1444  }
1445 
1446  UString::size_type UString::find_last_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1447  {
1448  size_type i = 0;
1449  const size_type len = length();
1450  if ( index > len ) index = len - 1;
1451 
1452  while ( i < num && ( index - i ) != npos ) {
1453  size_type j = index - i;
1454  // careful to step full Unicode characters
1455  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1456  j = index - ++i;
1457  }
1458  // and back to the usual dull test
1459  unicode_char ch = getChar( j );
1460  if ( str.inString( ch ) )
1461  return j;
1462  i++;
1463  }
1464  return npos;
1465  }
1466 
1468  {
 // Wraps the single value ch in a temporary UString (assign( 1, ch )) and delegates to the UString overload.
1469  UString tmp;
1470  tmp.assign( 1, ch );
1471  return find_last_of( tmp, index );
1472  }
1473 
#if MYGUI_IS_NATIVE_WCHAR_T
// Wide-character convenience overload: casts ch to unicode_char and delegates
// to the unicode_char overload of find_last_of.
UString::size_type UString::find_last_of( wchar_t ch, size_type index /*= npos */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return find_last_of( character, index );
}
#endif
1480 
1482  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and delegates to the UString overload.
1483  code_point cp[3] = {0, 0, 0};
1484  size_t l = _utf32_to_utf16( ch, cp );
1485  return find_last_of( UString( cp, l ), index );
1486  }
1487 
1488  UString::size_type UString::find_last_not_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1489  {
1490  size_type i = 0;
1491  const size_type len = length();
1492  if ( index > len ) index = len - 1;
1493 
1494  while ( i < num && ( index - i ) != npos ) {
1495  size_type j = index - i;
1496  // careful to step full Unicode characters
1497  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1498  j = index - ++i;
1499  }
1500  // and back to the usual dull test
1501  unicode_char ch = getChar( j );
1502  if ( !str.inString( ch ) )
1503  return j;
1504  i++;
1505  }
1506  return npos;
1507  }
1508 
1510  {
 // Wraps the single value ch in a temporary UString (assign( 1, ch )) and delegates to the UString overload.
1511  UString tmp;
1512  tmp.assign( 1, ch );
1513  return find_last_not_of( tmp, index );
1514  }
1515 
1516  UString::size_type UString::find_last_not_of( char ch, size_type index /*= npos */ ) const
1517  {
1518  return find_last_not_of( static_cast<code_point>( ch ), index );
1519  }
1520 
#if MYGUI_IS_NATIVE_WCHAR_T
// Wide-character convenience overload: casts ch to unicode_char and delegates
// to the unicode_char overload of find_last_not_of.
UString::size_type UString::find_last_not_of( wchar_t ch, size_type index /*= npos */ ) const
{
    const unicode_char character = static_cast<unicode_char>( ch );
    return find_last_not_of( character, index );
}
#endif
1527 
1529  {
 // Encodes ch with _utf32_to_utf16 into a zero-initialised scratch array, wraps the l code points
 // that call reports as written in a temporary UString, and delegates to the UString overload.
1530  code_point cp[3] = {0, 0, 0};
1531  size_t l = _utf32_to_utf16( ch, cp );
1532  return find_last_not_of( UString( cp, l ), index );
1533  }
1534 
1535  bool UString::operator<( const UString& right ) const
1536  {
1537  return compare( right ) < 0;
1538  }
1539 
1540  bool UString::operator<=( const UString& right ) const
1541  {
1542  return compare( right ) <= 0;
1543  }
1544 
1546  {
 // Assignment is implemented entirely by assign( s ); its result is returned to the caller.
1547  return assign( s );
1548  }
1549 
1551  {
 // Empties the string, then appends one copy of ch and returns the result of that append.
1552  clear();
1553  return append( 1, ch );
1554  }
1555 
1557  {
 // Empties the string, then appends one copy of ch and returns the result of that append.
1558  clear();
1559  return append( 1, ch );
1560  }
1561 
#if MYGUI_IS_NATIVE_WCHAR_T
// Replaces the contents with the single wide character ch: the string is
// emptied first and one copy of ch is appended.
UString& UString::operator=( wchar_t ch )
{
    clear();
    UString& self = append( 1, ch );
    return self;
}
#endif
1569 
1571  {
 // Empties the string, then appends one copy of ch and returns the result of that append.
1572  clear();
1573  return append( 1, ch );
1574  }
1575 
1576  bool UString::operator>( const UString& right ) const
1577  {
1578  return compare( right ) > 0;
1579  }
1580 
1581  bool UString::operator>=( const UString& right ) const
1582  {
1583  return compare( right ) >= 0;
1584  }
1585 
1586  bool UString::operator==( const UString& right ) const
1587  {
1588  return compare( right ) == 0;
1589  }
1590 
1591  bool UString::operator!=( const UString& right ) const
1592  {
1593  return !operator==( right );
1594  }
1595 
1597  {
 // Indexing is delegated to at( index ); whatever checking at() performs applies here as well.
1598  return at( index );
1599  }
1600 
1602  {
 // Indexing is delegated to at( index ); whatever checking at() performs applies here as well.
1603  return at( index );
1604  }
1605 
1606  UString::operator std::string() const
1607  {
1608  return std::string( asUTF8() );
1609  }
1610 
1612  UString::operator std::wstring() const
1613  {
1614  return std::wstring( asWStr() );
1615  }
1616 
1617 
1619  {
 // Returns false for any value in the surrogate range 0xD800-0xDFFF and true for everything else.
1620  if ( 0xD800 <= cp && cp <= 0xDFFF ) // tests if the cp is within the surrogate pair range
1621  return false; // it matches a surrogate pair signature
1622  return true; // everything else is a standalone code point
1623  }
1624 
1626  {
 // Returns true only for values in 0xD800-0xDBFF, the range used for the first word of a surrogate pair.
1627  if ( 0xD800 <= cp && cp <= 0xDBFF ) // tests if the cp is within the 1st word of a surrogate pair
1628  return true; // it is a 1st word
1629  return false; // it isn't
1630  }
1631 
1633  {
 // Returns true only for values in 0xDC00-0xDFFF, the range used for the second word of a surrogate pair.
1634  if ( 0xDC00 <= cp && cp <= 0xDFFF ) // tests if the cp is within the 2nd word of a surrogate pair
1635  return true; // it is a 2nd word
1636  return false; // everything else isn't
1637  }
1638 
1640  {
 // Length, in 16-bit words, of the character that starts with cp: 2 when cp is in 0xD800-0xDBFF, otherwise 1.
 // Only this first word is inspected; the following word is not checked here.
1641  if ( 0xD800 <= cp && cp <= 0xDBFF ) // test if cp is the beginning of a surrogate pair
1642  return 2; // if it is, then we are 2 words long
1643  return 1; // otherwise we are only 1 word long
1644  }
1645 
1647  {
 // Number of 16-bit words needed to store uc: 2 for any value above 0xFFFF, otherwise 1.
1648  if ( uc > 0xFFFF ) // test if uc is greater than the single word maximum
1649  return 2; // if so, we need a surrogate pair
1650  return 1; // otherwise we can stuff it into a single word
1651  }
1652 
1653  size_t UString::_utf16_to_utf32( const code_point in_cp[2], unicode_char& out_uc )
1654  {
1655  const code_point& cp1 = in_cp[0];
1656  const code_point& cp2 = in_cp[1];
1657  bool wordPair = false;
1658 
1659  // does it look like a surrogate pair?
1660  if ( 0xD800 <= cp1 && cp1 <= 0xDBFF ) {
1661  // looks like one, but does the other half match the algorithm as well?
1662  if ( 0xDC00 <= cp2 && cp2 <= 0xDFFF )
1663  wordPair = true; // yep!
1664  }
1665 
1666  if ( !wordPair ) { // if we aren't a 100% authentic surrogate pair, then just copy the value
1667  out_uc = cp1;
1668  return 1;
1669  }
1670 
1671  unsigned short cU = cp1, cL = cp2; // copy upper and lower words of surrogate pair to writable buffers
1672  cU -= 0xD800; // remove the encoding markers
1673  cL -= 0xDC00;
1674 
1675  out_uc = ( cU & 0x03FF ) << 10; // grab the 10 upper bits and set them in their proper location
1676  out_uc |= ( cL & 0x03FF ); // combine in the lower 10 bits
1677  out_uc += 0x10000; // add back in the value offset
1678 
1679  return 2; // this whole operation takes to words, so that's what we'll return
1680  }
1681 
1682  size_t UString::_utf32_to_utf16( const unicode_char& in_uc, code_point out_cp[2] )
1683  {
1684  if ( in_uc <= 0xFFFF ) { // we blindly preserve sentinel values because our decoder understands them
1685  out_cp[0] = static_cast<code_point>(in_uc);
1686  return 1;
1687  }
1688  unicode_char uc = in_uc; // copy to writable buffer
1689  unsigned short tmp; // single code point buffer
1690  uc -= 0x10000; // subtract value offset
1691 
1692  //process upper word
1693  tmp = static_cast<unsigned short>(( uc >> 10 ) & 0x03FF); // grab the upper 10 bits
1694  tmp += 0xD800; // add encoding offset
1695  out_cp[0] = tmp; // write
1696 
1697  // process lower word
1698  tmp = static_cast<unsigned short>(uc & 0x03FF); // grab the lower 10 bits
1699  tmp += 0xDC00; // add encoding offset
1700  out_cp[1] = tmp; // write
1701 
1702  return 2; // return used word count (2 for surrogate pairs)
1703  }
1704 
1705  bool UString::_utf8_start_char( unsigned char cp )
1706  {
1707  return ( cp & ~_cont_mask ) != _cont;
1708  }
1709 
1710  size_t UString::_utf8_char_length( unsigned char cp )
1711  {
1712  if ( !( cp & 0x80 ) ) return 1;
1713  if (( cp & ~_lead1_mask ) == _lead1 ) return 2;
1714  if (( cp & ~_lead2_mask ) == _lead2 ) return 3;
1715  if (( cp & ~_lead3_mask ) == _lead3 ) return 4;
1716  if (( cp & ~_lead4_mask ) == _lead4 ) return 5;
1717  if (( cp & ~_lead5_mask ) == _lead5 ) return 6;
1718 
1719  return 1;
1720  //throw invalid_data( "invalid UTF-8 sequence header value" );
1721  }
1722 
1724  {
 // Number of UTF-8 bytes needed for the value uc, chosen by the highest bit that is set (see table).
 // A value with bits above the 31-bit range falls through every test and is reported as length 1.
1725  /*
1726  7 bit: U-00000000 - U-0000007F: 0xxxxxxx
1727  11 bit: U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
1728  16 bit: U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
1729  21 bit: U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1730  26 bit: U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1731  31 bit: U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1732  */
1733  if ( !( uc & ~0x0000007F ) ) return 1;
1734  if ( !( uc & ~0x000007FF ) ) return 2;
1735  if ( !( uc & ~0x0000FFFF ) ) return 3;
1736  if ( !( uc & ~0x001FFFFF ) ) return 4;
1737  if ( !( uc & ~0x03FFFFFF ) ) return 5;
1738  if ( !( uc & ~0x7FFFFFFF ) ) return 6;
1739 
1740  return 1;
1741  //throw invalid_data( "invalid UTF-32 value" );
1742  }
1743 
1744  size_t UString::_utf8_to_utf32( const unsigned char in_cp[6], unicode_char& out_uc )
1745  {
1746  size_t len = _utf8_char_length( in_cp[0] );
1747  if ( len == 1 ) { // if we are only 1 byte long, then just grab it and exit
1748  out_uc = in_cp[0];
1749  return 1;
1750  }
1751 
1752  unicode_char c = 0; // temporary buffer
1753  size_t i = 0;
1754  switch ( len ) { // load header byte
1755  case 6:
1756  c = in_cp[i] & _lead5_mask;
1757  break;
1758  case 5:
1759  c = in_cp[i] & _lead4_mask;
1760  break;
1761  case 4:
1762  c = in_cp[i] & _lead3_mask;
1763  break;
1764  case 3:
1765  c = in_cp[i] & _lead2_mask;
1766  break;
1767  case 2:
1768  c = in_cp[i] & _lead1_mask;
1769  break;
1770  }
1771 
1772  // load each continuation byte
1773  for ( ++i; i < len; i++ )
1774  {
1775  if (( in_cp[i] & ~_cont_mask ) != _cont )
1776  {
1777  //throw invalid_data( "bad UTF-8 continuation byte" );
1778  out_uc = in_cp[0];
1779  return 1;
1780  }
1781  c <<= 6;
1782  c |= ( in_cp[i] & _cont_mask );
1783  }
1784 
1785  out_uc = c; // write the final value and return the used byte length
1786  return len;
1787  }
1788 
1789  size_t UString::_utf32_to_utf8( const unicode_char& in_uc, unsigned char out_cp[6] )
1790  {
1791  size_t len = _utf8_char_length( in_uc ); // predict byte length of sequence
1792  unicode_char c = in_uc; // copy to temp buffer
1793 
1794  //stuff all of the lower bits
1795  for ( size_t i = len - 1; i > 0; i-- ) {
1796  out_cp[i] = static_cast<unsigned char>((( c ) & _cont_mask ) | _cont);
1797  c >>= 6;
1798  }
1799 
1800  //now write the header byte
1801  switch ( len ) {
1802  case 6:
1803  out_cp[0] = static_cast<unsigned char>((( c ) & _lead5_mask ) | _lead5);
1804  break;
1805  case 5:
1806  out_cp[0] = static_cast<unsigned char>((( c ) & _lead4_mask ) | _lead4);
1807  break;
1808  case 4:
1809  out_cp[0] = static_cast<unsigned char>((( c ) & _lead3_mask ) | _lead3);
1810  break;
1811  case 3:
1812  out_cp[0] = static_cast<unsigned char>((( c ) & _lead2_mask ) | _lead2);
1813  break;
1814  case 2:
1815  out_cp[0] = static_cast<unsigned char>((( c ) & _lead1_mask ) | _lead1);
1816  break;
1817  case 1:
1818  default:
1819  out_cp[0] = static_cast<unsigned char>(( c ) & 0x7F);
1820  break;
1821  }
1822 
1823  // return the byte length of the sequence
1824  return len;
1825  }
1826 
1828  {
 // Copies the bytes at c_str into a std::string (the std::string constructor reads up to the first
 // zero byte) and delegates to the std::string overload of _verifyUTF8.
1829  std::string tmp( reinterpret_cast<const char*>( c_str ) );
1830  return _verifyUTF8( tmp );
1831  }
1832 
1833  UString::size_type UString::_verifyUTF8( const std::string& str )
1834  {
1835  std::string::const_iterator i, ie = str.end();
1836  i = str.begin();
1837  size_type length = 0;
1838 
1839  while ( i != ie ) {
1840  // characters pass until we find an extended sequence
1841  if (( *i ) & 0x80 ) {
1842  unsigned char c = ( *i );
1843  size_t contBytes = 0;
1844 
1845  // get continuation byte count and test for overlong sequences
1846  if (( c & ~_lead1_mask ) == _lead1 ) { // 1 additional byte
1847  if ( c == _lead1 )
1848  {
1849  //throw invalid_data( "overlong UTF-8 sequence" );
1850  return str.size();
1851  }
1852  contBytes = 1;
1853 
1854  } else if (( c & ~_lead2_mask ) == _lead2 ) { // 2 additional bytes
1855  contBytes = 2;
1856  if ( c == _lead2 ) { // possible overlong UTF-8 sequence
1857  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1858  if (( c & _lead2 ) == _cont )
1859  {
1860  //throw invalid_data( "overlong UTF-8 sequence" );
1861  return str.size();
1862  }
1863  }
1864 
1865  } else if (( c & ~_lead3_mask ) == _lead3 ) { // 3 additional bytes
1866  contBytes = 3;
1867  if ( c == _lead3 ) { // possible overlong UTF-8 sequence
1868  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1869  if (( c & _lead3 ) == _cont )
1870  {
1871  //throw invalid_data( "overlong UTF-8 sequence" );
1872  return str.size();
1873  }
1874  }
1875 
1876  } else if (( c & ~_lead4_mask ) == _lead4 ) { // 4 additional bytes
1877  contBytes = 4;
1878  if ( c == _lead4 ) { // possible overlong UTF-8 sequence
1879  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1880  if (( c & _lead4 ) == _cont )
1881  {
1882  //throw invalid_data( "overlong UTF-8 sequence" );
1883  return str.size();
1884  }
1885  }
1886 
1887  } else if (( c & ~_lead5_mask ) == _lead5 ) { // 5 additional bytes
1888  contBytes = 5;
1889  if ( c == _lead5 ) { // possible overlong UTF-8 sequence
1890  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1891  if (( c & _lead5 ) == _cont )
1892  {
1893  //throw invalid_data( "overlong UTF-8 sequence" );
1894  return str.size();
1895  }
1896  }
1897  }
1898 
1899  // check remaining continuation bytes for
1900  while ( contBytes-- ) {
1901  c = ( *( ++i ) ); // get next byte in sequence
1902  if (( c & ~_cont_mask ) != _cont )
1903  {
1904  //throw invalid_data( "bad UTF-8 continuation byte" );
1905  return str.size();
1906  }
1907  }
1908  }
1909  length++;
1910  i++;
1911  }
1912  return length;
1913  }
1914 
1915  void UString::_init()
1916  {
1917  m_buffer.mVoidBuffer = 0;
1918  m_bufferType = bt_none;
1919  m_bufferSize = 0;
1920  }
1921 
1922  void UString::_cleanBuffer() const
1923  {
1924  if ( m_buffer.mVoidBuffer != 0 ) {
1925  switch ( m_bufferType ) {
1926  case bt_string:
1927  delete m_buffer.mStrBuffer;
1928  break;
1929  case bt_wstring:
1930  delete m_buffer.mWStrBuffer;
1931  break;
1932  case bt_utf32string:
1933  delete m_buffer.mUTF32StrBuffer;
1934  break;
1935  case bt_none: // under the worse of circumstances, this is all we can do, and hope it works out
1936  default:
1937  //delete m_buffer.mVoidBuffer;
1938  // delete void* is undefined, don't do that
1939  assert("This should never happen - mVoidBuffer should never contain something if we "
1940  "don't know the type");
1941  break;
1942  }
1943  m_buffer.mVoidBuffer = 0;
1944  m_bufferSize = 0;
1945  m_bufferType = bt_none;
1946  }
1947  }
1948 
1949  void UString::_getBufferStr() const
1950  {
1951  if ( m_bufferType != bt_string ) {
1952  _cleanBuffer();
1953  m_buffer.mStrBuffer = new std::string();
1954  m_bufferType = bt_string;
1955  }
1956  m_buffer.mStrBuffer->clear();
1957  }
1958 
1959  void UString::_getBufferWStr() const
1960  {
1961  if ( m_bufferType != bt_wstring ) {
1962  _cleanBuffer();
1963  m_buffer.mWStrBuffer = new std::wstring();
1964  m_bufferType = bt_wstring;
1965  }
1966  m_buffer.mWStrBuffer->clear();
1967  }
1968 
1969  void UString::_getBufferUTF32Str() const
1970  {
1971  if ( m_bufferType != bt_utf32string ) {
1972  _cleanBuffer();
1973  m_buffer.mUTF32StrBuffer = new utf32string();
1974  m_bufferType = bt_utf32string;
1975  }
1976  m_buffer.mUTF32StrBuffer->clear();
1977  }
1978 
1979  void UString::_load_buffer_UTF8() const
1980  {
1981  _getBufferStr();
1982  std::string& buffer = ( *m_buffer.mStrBuffer );
1983  buffer.reserve( length() );
1984 
1985  unsigned char utf8buf[6];
1986  char* charbuf = ( char* )utf8buf;
1987  unicode_char c;
1988  size_t len;
1989 
1990  const_iterator i, ie = end();
1991  for ( i = begin(); i != ie; i.moveNext() ) {
1992  c = i.getCharacter();
1993  len = _utf32_to_utf8( c, utf8buf );
1994  size_t j = 0;
1995  while ( j < len )
1996  buffer.push_back( charbuf[j++] );
1997  }
1998  }
1999 
2000  void UString::_load_buffer_WStr() const
2001  {
2002  _getBufferWStr();
2003  std::wstring& buffer = ( *m_buffer.mWStrBuffer );
2004  buffer.reserve( length() ); // may over reserve, but should be close enough
2005 #ifdef WCHAR_UTF16 // wchar_t matches UTF-16
2006  const_iterator i, ie = end();
2007  for ( i = begin(); i != ie; ++i ) {
2008  buffer.push_back(( wchar_t )( *i ) );
2009  }
2010 #else // wchar_t fits UTF-32
2011  unicode_char c;
2012  const_iterator i, ie = end();
2013  for ( i = begin(); i != ie; i.moveNext() ) {
2014  c = i.getCharacter();
2015  buffer.push_back(( wchar_t )c );
2016  }
2017 #endif
2018  }
2019 
2020  void UString::_load_buffer_UTF32() const
2021  {
2022  _getBufferUTF32Str();
2023  utf32string& buffer = ( *m_buffer.mUTF32StrBuffer );
2024  buffer.reserve( length() ); // may over reserve, but should be close enough
2025 
2026  unicode_char c;
2027 
2028  const_iterator i, ie = end();
2029  for ( i = begin(); i != ie; i.moveNext() ) {
2030  c = i.getCharacter();
2031  buffer.push_back( c );
2032  }
2033  }
2034 
2035 } // namespace MyGUI