/[dtapublic]/projs/ets/trunk/src/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Diff of /projs/ets/trunk/src/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 67 by dashley, Mon Oct 31 00:57:34 2016 UTC revision 71 by dashley, Sat Nov 5 11:07:06 2016 UTC
# Line 1  Line 1 
1  /* $Header$ */  /* $Header$ */
2  /*  /*
3   * tclUtf.c --   * tclUtf.c --
4   *   *
5   *      Routines for manipulating UTF-8 strings.   *      Routines for manipulating UTF-8 strings.
6   *   *
7   * Copyright (c) 1997-1998 Sun Microsystems, Inc.   * Copyright (c) 1997-1998 Sun Microsystems, Inc.
8   *   *
9   * See the file "license.terms" for information on usage and redistribution   * See the file "license.terms" for information on usage and redistribution
10   * of this file, and for a DISCLAIMER OF ALL WARRANTIES.   * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11   *   *
12   * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $   * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
13   */   */
14    
15  #include "tclInt.h"  #include "tclInt.h"
16    
17  /*  /*
18   * Include the static character classification tables and macros.   * Include the static character classification tables and macros.
19   */   */
20    
21  #include "tclUniData.c"  #include "tclUniData.c"
22    
/*
 * Bit masks for fast character category tests.  Each mask has one bit set
 * for every category it contains, at the bit position given by the category
 * value.  A mask is shifted right by a character's category value to
 * determine whether that category is included in the set.
 */

#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) | \
     (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | \
     (1 << OTHER_LETTER))

#define DIGIT_BITS      (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) | \
     (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS  (1 << CONNECTOR_PUNCTUATION)

/*
 * Every punctuation category, connector punctuation included.
 */

#define PUNCT_BITS \
    (CONNECTOR_BITS | \
     (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
     (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
     (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Letters, digits, separators, and punctuation, plus the mark, remaining
 * number, and symbol categories.
 */

#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
     (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
     (1 << COMBINING_SPACING_MARK) | \
     (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER) | \
     (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
     (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * Unicode characters less than this value are represented by themselves
 * in UTF-8 strings.
 */

#define UNICODE_SELF    0x80
60    
61  /*  /*
62   * The following structures are used when mapping between Unicode (UCS-2)   * The following structures are used when mapping between Unicode (UCS-2)
63   * and UTF-8.   * and UTF-8.
64   */   */
65    
66  CONST unsigned char totalBytes[256] = {  CONST unsigned char totalBytes[256] = {
67      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
74      3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,      3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
75  #if TCL_UTF_MAX > 3  #if TCL_UTF_MAX > 3
76      4,4,4,4,4,4,4,4,      4,4,4,4,4,4,4,4,
77  #else  #else
78      1,1,1,1,1,1,1,1,      1,1,1,1,1,1,1,1,
79  #endif  #endif
80  #if TCL_UTF_MAX > 4  #if TCL_UTF_MAX > 4
81      5,5,5,5,      5,5,5,5,
82  #else  #else
83      1,1,1,1,      1,1,1,1,
84  #endif  #endif
85  #if TCL_UTF_MAX > 5  #if TCL_UTF_MAX > 5
86      6,6,6,6      6,6,6,6
87  #else  #else
88      1,1,1,1      1,1,1,1
89  #endif  #endif
90  };  };
91    
92  /*  /*
93   * Procedures used only in this module.   * Procedures used only in this module.
94   */   */
95    
96  static int UtfCount _ANSI_ARGS_((int ch));  static int UtfCount _ANSI_ARGS_((int ch));
97    
98    
99  /*  /*
100   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
101   *   *
102   * UtfCount --   * UtfCount --
103   *   *
104   *      Find the number of bytes in the Utf character "ch".   *      Find the number of bytes in the Utf character "ch".
105   *   *
106   * Results:   * Results:
107   *      The return values is the number of bytes in the Utf character "ch".   *      The return values is the number of bytes in the Utf character "ch".
108   *   *
109   * Side effects:   * Side effects:
110   *      None.   *      None.
111   *   *
112   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
113   */   */
114    
115  static int  static int
116  UtfCount(ch)  UtfCount(ch)
117      int ch;                     /* The Tcl_UniChar whose size is returned. */      int ch;                     /* The Tcl_UniChar whose size is returned. */
118  {  {
119      if ((ch > 0) && (ch < UNICODE_SELF)) {      if ((ch > 0) && (ch < UNICODE_SELF)) {
120          return 1;          return 1;
121      }      }
122      if (ch <= 0x7FF) {      if (ch <= 0x7FF) {
123          return 2;          return 2;
124      }      }
125      if (ch <= 0xFFFF) {      if (ch <= 0xFFFF) {
126          return 3;          return 3;
127      }      }
128  #if TCL_UTF_MAX > 3  #if TCL_UTF_MAX > 3
129      if (ch <= 0x1FFFFF) {      if (ch <= 0x1FFFFF) {
130          return 4;          return 4;
131      }      }
132      if (ch <= 0x3FFFFFF) {      if (ch <= 0x3FFFFFF) {
133          return 5;          return 5;
134      }      }
135      if (ch <= 0x7FFFFFFF) {      if (ch <= 0x7FFFFFFF) {
136          return 6;          return 6;
137      }      }
138  #endif  #endif
139      return 3;      return 3;
140  }  }
141    
142  /*  /*
143   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
144   *   *
145   * Tcl_UniCharToUtf --   * Tcl_UniCharToUtf --
146   *   *
147   *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the   *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
148   *      provided buffer.  Equivalent to Plan 9 runetochar().   *      provided buffer.  Equivalent to Plan 9 runetochar().
149   *   *
150   * Results:   * Results:
151   *      The return values is the number of bytes in the buffer that   *      The return values is the number of bytes in the buffer that
152   *      were consumed.     *      were consumed.  
153   *   *
154   * Side effects:   * Side effects:
155   *      None.   *      None.
156   *   *
157   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
158   */   */
159    
160  INLINE int  INLINE int
161  Tcl_UniCharToUtf(ch, str)  Tcl_UniCharToUtf(ch, str)
162      int ch;                     /* The Tcl_UniChar to be stored in the      int ch;                     /* The Tcl_UniChar to be stored in the
163                                   * buffer. */                                   * buffer. */
164      char *str;                  /* Buffer in which the UTF-8 representation      char *str;                  /* Buffer in which the UTF-8 representation
165                                   * of the Tcl_UniChar is stored.  Buffer must                                   * of the Tcl_UniChar is stored.  Buffer must
166                                   * be large enough to hold the UTF-8 character                                   * be large enough to hold the UTF-8 character
167                                   * (at most TCL_UTF_MAX bytes). */                                   * (at most TCL_UTF_MAX bytes). */
168  {  {
169      if ((ch > 0) && (ch < UNICODE_SELF)) {      if ((ch > 0) && (ch < UNICODE_SELF)) {
170          str[0] = (char) ch;          str[0] = (char) ch;
171          return 1;          return 1;
172      }      }
173      if (ch <= 0x7FF) {      if (ch <= 0x7FF) {
174          str[1] = (char) ((ch | 0x80) & 0xBF);          str[1] = (char) ((ch | 0x80) & 0xBF);
175          str[0] = (char) ((ch >> 6) | 0xC0);          str[0] = (char) ((ch >> 6) | 0xC0);
176          return 2;          return 2;
177      }      }
178      if (ch <= 0xFFFF) {      if (ch <= 0xFFFF) {
179          three:          three:
180          str[2] = (char) ((ch | 0x80) & 0xBF);          str[2] = (char) ((ch | 0x80) & 0xBF);
181          str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);          str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
182          str[0] = (char) ((ch >> 12) | 0xE0);          str[0] = (char) ((ch >> 12) | 0xE0);
183          return 3;          return 3;
184      }      }
185    
186  #if TCL_UTF_MAX > 3  #if TCL_UTF_MAX > 3
187      if (ch <= 0x1FFFFF) {      if (ch <= 0x1FFFFF) {
188          str[3] = (char) ((ch | 0x80) & 0xBF);          str[3] = (char) ((ch | 0x80) & 0xBF);
189          str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);          str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
190          str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);          str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
191          str[0] = (char) ((ch >> 18) | 0xF0);          str[0] = (char) ((ch >> 18) | 0xF0);
192          return 4;          return 4;
193      }      }
194      if (ch <= 0x3FFFFFF) {      if (ch <= 0x3FFFFFF) {
195          str[4] = (char) ((ch | 0x80) & 0xBF);          str[4] = (char) ((ch | 0x80) & 0xBF);
196          str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);          str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
197          str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);          str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
198          str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);          str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
199          str[0] = (char) ((ch >> 24) | 0xF8);          str[0] = (char) ((ch >> 24) | 0xF8);
200          return 5;          return 5;
201      }      }
202      if (ch <= 0x7FFFFFFF) {      if (ch <= 0x7FFFFFFF) {
203          str[5] = (char) ((ch | 0x80) & 0xBF);          str[5] = (char) ((ch | 0x80) & 0xBF);
204          str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);          str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
205          str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);          str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
206          str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);          str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
207          str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);          str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
208          str[0] = (char) ((ch >> 30) | 0xFC);          str[0] = (char) ((ch >> 30) | 0xFC);
209          return 6;          return 6;
210      }      }
211  #endif  #endif
212    
213      ch = 0xFFFD;      ch = 0xFFFD;
214      goto three;      goto three;
215  }  }
216    
217  /*  /*
218   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
219   *   *
220   * Tcl_UniCharToUtfDString --   * Tcl_UniCharToUtfDString --
221   *   *
222   *      Convert the given Unicode string to UTF-8.   *      Convert the given Unicode string to UTF-8.
223   *   *
224   * Results:   * Results:
225   *      The return value is a pointer to the UTF-8 representation of the   *      The return value is a pointer to the UTF-8 representation of the
226   *      Unicode string.  Storage for the return value is appended to the   *      Unicode string.  Storage for the return value is appended to the
227   *      end of dsPtr.   *      end of dsPtr.
228   *   *
229   * Side effects:   * Side effects:
230   *      None.   *      None.
231   *   *
232   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
233   */   */
234    
235  char *  char *
236  Tcl_UniCharToUtfDString(wString, numChars, dsPtr)  Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
237      CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */      CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
238      int numChars;               /* Length of Unicode string in Tcl_UniChars      int numChars;               /* Length of Unicode string in Tcl_UniChars
239                                   * (must be >= 0). */                                   * (must be >= 0). */
240      Tcl_DString *dsPtr;         /* UTF-8 representation of string is      Tcl_DString *dsPtr;         /* UTF-8 representation of string is
241                                   * appended to this previously initialized                                   * appended to this previously initialized
242                                   * DString. */                                   * DString. */
243  {  {
244      CONST Tcl_UniChar *w, *wEnd;      CONST Tcl_UniChar *w, *wEnd;
245      char *p, *string;      char *p, *string;
246      int oldLength;      int oldLength;
247    
248      /*      /*
249       * UTF-8 string length in bytes will be <= Unicode string length *       * UTF-8 string length in bytes will be <= Unicode string length *
250       * TCL_UTF_MAX.       * TCL_UTF_MAX.
251       */       */
252    
253      oldLength = Tcl_DStringLength(dsPtr);      oldLength = Tcl_DStringLength(dsPtr);
254      Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);      Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
255      string = Tcl_DStringValue(dsPtr) + oldLength;      string = Tcl_DStringValue(dsPtr) + oldLength;
256    
257      p = string;      p = string;
258      wEnd = wString + numChars;      wEnd = wString + numChars;
259      for (w = wString; w < wEnd; ) {      for (w = wString; w < wEnd; ) {
260          p += Tcl_UniCharToUtf(*w, p);          p += Tcl_UniCharToUtf(*w, p);
261          w++;          w++;
262      }      }
263      Tcl_DStringSetLength(dsPtr, oldLength + (p - string));      Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
264    
265      return string;      return string;
266  }  }
267    
268  /*  /*
269   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
270   *   *
271   * Tcl_UtfToUniChar --   * Tcl_UtfToUniChar --
272   *   *
273   *      Extract the Tcl_UniChar represented by the UTF-8 string.  Bad   *      Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
274   *      UTF-8 sequences are converted to valid Tcl_UniChars and processing   *      UTF-8 sequences are converted to valid Tcl_UniChars and processing
275   *      continues.  Equivalent to Plan 9 chartorune().   *      continues.  Equivalent to Plan 9 chartorune().
276   *   *
277   *      The caller must ensure that the source buffer is long enough that   *      The caller must ensure that the source buffer is long enough that
278   *      this routine does not run off the end and dereference non-existent   *      this routine does not run off the end and dereference non-existent
279   *      memory looking for trail bytes.  If the source buffer is known to   *      memory looking for trail bytes.  If the source buffer is known to
280   *      be '\0' terminated, this cannot happen.  Otherwise, the caller   *      be '\0' terminated, this cannot happen.  Otherwise, the caller
281   *      should call Tcl_UtfCharComplete() before calling this routine to   *      should call Tcl_UtfCharComplete() before calling this routine to
282   *      ensure that enough bytes remain in the string.   *      ensure that enough bytes remain in the string.
283   *   *
284   * Results:   * Results:
285   *      *chPtr is filled with the Tcl_UniChar, and the return value is the   *      *chPtr is filled with the Tcl_UniChar, and the return value is the
286   *      number of bytes from the UTF-8 string that were consumed.   *      number of bytes from the UTF-8 string that were consumed.
287   *   *
288   * Side effects:   * Side effects:
289   *      None.   *      None.
290   *   *
291   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
292   */   */
293    
294  int  int
295  Tcl_UtfToUniChar(str, chPtr)  Tcl_UtfToUniChar(str, chPtr)
296      register CONST char *str;    /* The UTF-8 string. */      register CONST char *str;    /* The UTF-8 string. */
297      register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented      register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
298                                    * by the UTF-8 string. */                                    * by the UTF-8 string. */
299  {  {
300      register int byte;      register int byte;
301            
302      /*      /*
303       * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.       * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
304       */       */
305    
306      byte = *((unsigned char *) str);      byte = *((unsigned char *) str);
307      if (byte < 0xC0) {      if (byte < 0xC0) {
308          /*          /*
309           * Handles properly formed UTF-8 characters between 0x01 and 0x7F.           * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
310           * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid           * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
311           * characters representing themselves.           * characters representing themselves.
312           */           */
313                    
314          *chPtr = (Tcl_UniChar) byte;          *chPtr = (Tcl_UniChar) byte;
315          return 1;          return 1;
316      } else if (byte < 0xE0) {      } else if (byte < 0xE0) {
317          if ((str[1] & 0xC0) == 0x80) {          if ((str[1] & 0xC0) == 0x80) {
318              /*              /*
319               * Two-byte-character lead-byte followed by a trail-byte.               * Two-byte-character lead-byte followed by a trail-byte.
320               */               */
321                            
322              *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));              *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
323              return 2;              return 2;
324          }          }
325          /*          /*
326           * A two-byte-character lead-byte not followed by trail-byte           * A two-byte-character lead-byte not followed by trail-byte
327           * represents itself.           * represents itself.
328           */           */
329                    
330          *chPtr = (Tcl_UniChar) byte;          *chPtr = (Tcl_UniChar) byte;
331          return 1;          return 1;
332      } else if (byte < 0xF0) {      } else if (byte < 0xF0) {
333          if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {          if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
334              /*              /*
335               * Three-byte-character lead byte followed by two trail bytes.               * Three-byte-character lead byte followed by two trail bytes.
336               */               */
337    
338              *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)              *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339                      | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));                      | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
340              return 3;              return 3;
341          }          }
342          /*          /*
343           * A three-byte-character lead-byte not followed by two trail-bytes           * A three-byte-character lead-byte not followed by two trail-bytes
344           * represents itself.           * represents itself.
345           */           */
346    
347          *chPtr = (Tcl_UniChar) byte;          *chPtr = (Tcl_UniChar) byte;
348          return 1;          return 1;
349      }      }
350  #if TCL_UTF_MAX > 3  #if TCL_UTF_MAX > 3
351      else {      else {
352          int ch, total, trail;          int ch, total, trail;
353    
354          total = totalBytes[byte];          total = totalBytes[byte];
355          trail = total - 1;          trail = total - 1;
356          if (trail > 0) {          if (trail > 0) {
357              ch = byte & (0x3F >> trail);              ch = byte & (0x3F >> trail);
358              do {              do {
359                  str++;                  str++;
360                  if ((*str & 0xC0) != 0x80) {                  if ((*str & 0xC0) != 0x80) {
361                      *chPtr = byte;                      *chPtr = byte;
362                      return 1;                      return 1;
363                  }                  }
364                  ch <<= 6;                  ch <<= 6;
365                  ch |= (*str & 0x3F);                  ch |= (*str & 0x3F);
366                  trail--;                  trail--;
367              } while (trail > 0);              } while (trail > 0);
368              *chPtr = ch;              *chPtr = ch;
369              return total;              return total;
370          }          }
371      }      }
372  #endif  #endif
373    
374      *chPtr = (Tcl_UniChar) byte;      *chPtr = (Tcl_UniChar) byte;
375      return 1;      return 1;
376  }  }
377    
378  /*  /*
379   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
380   *   *
381   * Tcl_UtfToUniCharDString --   * Tcl_UtfToUniCharDString --
382   *   *
383   *      Convert the UTF-8 string to Unicode.   *      Convert the UTF-8 string to Unicode.
384   *   *
385   * Results:   * Results:
386   *      The return value is a pointer to the Unicode representation of the   *      The return value is a pointer to the Unicode representation of the
387   *      UTF-8 string.  Storage for the return value is appended to the   *      UTF-8 string.  Storage for the return value is appended to the
388   *      end of dsPtr.  The Unicode string is terminated with a Unicode   *      end of dsPtr.  The Unicode string is terminated with a Unicode
389   *      NULL character.   *      NULL character.
390   *   *
391   * Side effects:   * Side effects:
392   *      None.   *      None.
393   *   *
394   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
395   */   */
396    
397  Tcl_UniChar *  Tcl_UniChar *
398  Tcl_UtfToUniCharDString(string, length, dsPtr)  Tcl_UtfToUniCharDString(string, length, dsPtr)
399      CONST char *string;         /* UTF-8 string to convert to Unicode. */      CONST char *string;         /* UTF-8 string to convert to Unicode. */
400      int length;                 /* Length of UTF-8 string in bytes, or -1      int length;                 /* Length of UTF-8 string in bytes, or -1
401                                   * for strlen(). */                                   * for strlen(). */
402      Tcl_DString *dsPtr;         /* Unicode representation of string is      Tcl_DString *dsPtr;         /* Unicode representation of string is
403                                   * appended to this previously initialized                                   * appended to this previously initialized
404                                   * DString. */                                   * DString. */
405  {  {
406      Tcl_UniChar *w, *wString;      Tcl_UniChar *w, *wString;
407      CONST char *p, *end;      CONST char *p, *end;
408      int oldLength;      int oldLength;
409    
410      if (length < 0) {      if (length < 0) {
411          length = strlen(string);          length = strlen(string);
412      }      }
413    
414      /*      /*
415       * Unicode string length in Tcl_UniChars will be <= UTF-8 string length       * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
416       * in bytes.       * in bytes.
417       */       */
418    
419      oldLength = Tcl_DStringLength(dsPtr);      oldLength = Tcl_DStringLength(dsPtr);
420      Tcl_DStringSetLength(dsPtr,      Tcl_DStringSetLength(dsPtr,
421              (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));              (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422      wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);      wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423    
424      w = wString;      w = wString;
425      end = string + length;      end = string + length;
426      for (p = string; p < end; ) {      for (p = string; p < end; ) {
427          p += Tcl_UtfToUniChar(p, w);          p += Tcl_UtfToUniChar(p, w);
428          w++;          w++;
429      }      }
430      *w = '\0';      *w = '\0';
431      Tcl_DStringSetLength(dsPtr,      Tcl_DStringSetLength(dsPtr,
432              (oldLength + ((char *) w - (char *) wString)));              (oldLength + ((char *) w - (char *) wString)));
433    
434      return wString;      return wString;
435  }  }
436    
437  /*  /*
438   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
439   *   *
440   * Tcl_UtfCharComplete --   * Tcl_UtfCharComplete --
441   *   *
442   *      Determine if the UTF-8 string of the given length is long enough   *      Determine if the UTF-8 string of the given length is long enough
443   *      to be decoded by Tcl_UtfToUniChar().  This does not ensure that the   *      to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
444   *      UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().   *      UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
445   *   *
446   * Results:   * Results:
447   *      The return value is 0 if the string is not long enough, non-zero   *      The return value is 0 if the string is not long enough, non-zero
448   *      otherwise.   *      otherwise.
449   *   *
450   * Side effects:   * Side effects:
451   *      None.   *      None.
452   *   *
453   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
454   */   */
455    
456  int  int
457  Tcl_UtfCharComplete(str, len)  Tcl_UtfCharComplete(str, len)
458      CONST char *str;            /* String to check if first few bytes      CONST char *str;            /* String to check if first few bytes
459                                   * contain a complete UTF-8 character. */                                   * contain a complete UTF-8 character. */
460      int len;                    /* Length of above string in bytes. */      int len;                    /* Length of above string in bytes. */
461  {  {
462      int ch;      int ch;
463    
464      ch = *((unsigned char *) str);      ch = *((unsigned char *) str);
465      return len >= totalBytes[ch];      return len >= totalBytes[ch];
466  }  }
467    
468  /*  /*
469   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
470   *   *
471   * Tcl_NumUtfChars --   * Tcl_NumUtfChars --
472   *   *
473   *      Returns the number of characters (not bytes) in the UTF-8 string,   *      Returns the number of characters (not bytes) in the UTF-8 string,
474   *      not including the terminating NULL byte.  This is equivalent to   *      not including the terminating NULL byte.  This is equivalent to
475   *      Plan 9 utflen() and utfnlen().   *      Plan 9 utflen() and utfnlen().
476   *   *
477   * Results:   * Results:
478   *      As above.     *      As above.  
479   *   *
480   * Side effects:   * Side effects:
481   *      None.   *      None.
482   *   *
483   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
484   */   */
485    
486  int  int
487  Tcl_NumUtfChars(str, len)  Tcl_NumUtfChars(str, len)
488      register CONST char *str;   /* The UTF-8 string to measure. */      register CONST char *str;   /* The UTF-8 string to measure. */
489      int len;                    /* The length of the string in bytes, or -1      int len;                    /* The length of the string in bytes, or -1
490                                   * for strlen(string). */                                   * for strlen(string). */
491  {  {
492      Tcl_UniChar ch;      Tcl_UniChar ch;
493      register Tcl_UniChar *chPtr = &ch;      register Tcl_UniChar *chPtr = &ch;
494      register int n;      register int n;
495      int i;      int i;
496    
497      /*      /*
498       * The separate implementations are faster.       * The separate implementations are faster.
499       */       */
500            
501      i = 0;      i = 0;
502      if (len < 0) {      if (len < 0) {
503          while (1) {          while (1) {
504              str += Tcl_UtfToUniChar(str, chPtr);              str += Tcl_UtfToUniChar(str, chPtr);
505              if (ch == '\0') {              if (ch == '\0') {
506                  break;                  break;
507              }              }
508              i++;              i++;
509          }          }
510      } else {      } else {
511          while (len > 0) {          while (len > 0) {
512              n = Tcl_UtfToUniChar(str, chPtr);              n = Tcl_UtfToUniChar(str, chPtr);
513              len -= n;              len -= n;
514              str += n;              str += n;
515              i++;              i++;
516          }          }
517      }      }
518      return i;      return i;
519  }  }
520    
521  /*  /*
522   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
523   *   *
524   * Tcl_UtfFindFirst --   * Tcl_UtfFindFirst --
525   *   *
526   *      Returns a pointer to the first occurance of the given Tcl_UniChar   *      Returns a pointer to the first occurance of the given Tcl_UniChar
527   *      in the NULL-terminated UTF-8 string.  The NULL terminator is   *      in the NULL-terminated UTF-8 string.  The NULL terminator is
528   *      considered part of the UTF-8 string.  Equivalent to Plan 9   *      considered part of the UTF-8 string.  Equivalent to Plan 9
529   *      utfrune().   *      utfrune().
530   *   *
531   * Results:   * Results:
532   *      As above.  If the Tcl_UniChar does not exist in the given string,   *      As above.  If the Tcl_UniChar does not exist in the given string,
533   *      the return value is NULL.   *      the return value is NULL.
534   *   *
535   * Side effects:   * Side effects:
536   *      None.   *      None.
537   *   *
538   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
539   */   */
540  char *  char *
541  Tcl_UtfFindFirst(string, ch)  Tcl_UtfFindFirst(string, ch)
542      CONST char *string;         /* The UTF-8 string to be searched. */      CONST char *string;         /* The UTF-8 string to be searched. */
543      int ch;                     /* The Tcl_UniChar to search for. */      int ch;                     /* The Tcl_UniChar to search for. */
544  {  {
545      int len;      int len;
546      Tcl_UniChar find;      Tcl_UniChar find;
547            
548      while (1) {      while (1) {
549          len = Tcl_UtfToUniChar(string, &find);          len = Tcl_UtfToUniChar(string, &find);
550          if (find == ch) {          if (find == ch) {
551              return (char *) string;              return (char *) string;
552          }          }
553          if (*string == '\0') {          if (*string == '\0') {
554              return NULL;              return NULL;
555          }          }
556          string += len;          string += len;
557      }      }
558  }  }
559    
560  /*  /*
561   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
562   *   *
563   * Tcl_UtfFindLast --   * Tcl_UtfFindLast --
564   *   *
565   *      Returns a pointer to the last occurance of the given Tcl_UniChar   *      Returns a pointer to the last occurance of the given Tcl_UniChar
566   *      in the NULL-terminated UTF-8 string.  The NULL terminator is   *      in the NULL-terminated UTF-8 string.  The NULL terminator is
567   *      considered part of the UTF-8 string.  Equivalent to Plan 9   *      considered part of the UTF-8 string.  Equivalent to Plan 9
568   *      utfrrune().   *      utfrrune().
569   *   *
570   * Results:   * Results:
571   *      As above.  If the Tcl_UniChar does not exist in the given string,   *      As above.  If the Tcl_UniChar does not exist in the given string,
572   *      the return value is NULL.   *      the return value is NULL.
573   *   *
574   * Side effects:   * Side effects:
575   *      None.   *      None.
576   *   *
577   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
578   */   */
579    
580  char *  char *
581  Tcl_UtfFindLast(string, ch)  Tcl_UtfFindLast(string, ch)
582      CONST char *string;         /* The UTF-8 string to be searched. */      CONST char *string;         /* The UTF-8 string to be searched. */
583      int ch;                     /* The Tcl_UniChar to search for. */      int ch;                     /* The Tcl_UniChar to search for. */
584  {  {
585      int len;      int len;
586      Tcl_UniChar find;      Tcl_UniChar find;
587      CONST char *last;      CONST char *last;
588                    
589      last = NULL;      last = NULL;
590      while (1) {      while (1) {
591          len = Tcl_UtfToUniChar(string, &find);          len = Tcl_UtfToUniChar(string, &find);
592          if (find == ch) {          if (find == ch) {
593              last = string;              last = string;
594          }          }
595          if (*string == '\0') {          if (*string == '\0') {
596              break;              break;
597          }          }
598          string += len;          string += len;
599      }      }
600      return (char *) last;      return (char *) last;
601  }  }
602    
603  /*  /*
604   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
605   *   *
606   * Tcl_UtfNext --   * Tcl_UtfNext --
607   *   *
608   *      Given a pointer to some current location in a UTF-8 string,   *      Given a pointer to some current location in a UTF-8 string,
609   *      move forward one character.  The caller must ensure that they   *      move forward one character.  The caller must ensure that they
610   *      are not asking for the next character after the last character   *      are not asking for the next character after the last character
611   *      in the string.   *      in the string.
612   *   *
613   * Results:   * Results:
614   *      The return value is the pointer to the next character in   *      The return value is the pointer to the next character in
615   *      the UTF-8 string.   *      the UTF-8 string.
616   *   *
617   * Side effects:   * Side effects:
618   *      None.   *      None.
619   *   *
620   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
621   */   */
622    
623  char *  char *
624  Tcl_UtfNext(str)  Tcl_UtfNext(str)
625      CONST char *str;                /* The current location in the string. */      CONST char *str;                /* The current location in the string. */
626  {  {
627      Tcl_UniChar ch;      Tcl_UniChar ch;
628    
629      return (char *) str + Tcl_UtfToUniChar(str, &ch);      return (char *) str + Tcl_UtfToUniChar(str, &ch);
630  }  }
631    
632  /*  /*
633   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
634   *   *
635   * Tcl_UtfPrev --   * Tcl_UtfPrev --
636   *   *
637   *      Given a pointer to some current location in a UTF-8 string,   *      Given a pointer to some current location in a UTF-8 string,
638   *      move backwards one character.   *      move backwards one character.
639   *   *
640   * Results:   * Results:
641   *      The return value is a pointer to the previous character in the   *      The return value is a pointer to the previous character in the
642   *      UTF-8 string.  If the current location was already at the   *      UTF-8 string.  If the current location was already at the
643   *      beginning of the string, the return value will also be a   *      beginning of the string, the return value will also be a
644   *      pointer to the beginning of the string.   *      pointer to the beginning of the string.
645   *   *
646   * Side effects:   * Side effects:
647   *      None.   *      None.
648   *   *
649   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
650   */   */
651    
652  char *  char *
653  Tcl_UtfPrev(str, start)  Tcl_UtfPrev(str, start)
654      CONST char *str;                /* The current location in the string. */      CONST char *str;                /* The current location in the string. */
655      CONST char *start;              /* Pointer to the beginning of the      CONST char *start;              /* Pointer to the beginning of the
656                                       * string, to avoid going backwards too                                       * string, to avoid going backwards too
657                                       * far. */                                       * far. */
658  {  {
659      CONST char *look;      CONST char *look;
660      int i, byte;      int i, byte;
661            
662      str--;      str--;
663      look = str;      look = str;
664      for (i = 0; i < TCL_UTF_MAX; i++) {      for (i = 0; i < TCL_UTF_MAX; i++) {
665          if (look < start) {          if (look < start) {
666              if (str < start) {              if (str < start) {
667                  str = start;                  str = start;
668              }              }
669              break;              break;
670          }          }
671          byte = *((unsigned char *) look);          byte = *((unsigned char *) look);
672          if (byte < 0x80) {          if (byte < 0x80) {
673              break;              break;
674          }          }
675          if (byte >= 0xC0) {          if (byte >= 0xC0) {
676              if (totalBytes[byte] != i + 1) {              if (totalBytes[byte] != i + 1) {
677                  break;                  break;
678              }              }
679              return (char *) look;              return (char *) look;
680          }          }
681          look--;          look--;
682      }      }
683      return (char *) str;      return (char *) str;
684  }  }
685                
686  /*  /*
687   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
688   *   *
689   * Tcl_UniCharAtIndex --   * Tcl_UniCharAtIndex --
690   *   *
691   *      Returns the Unicode character represented at the specified   *      Returns the Unicode character represented at the specified
692   *      character (not byte) position in the UTF-8 string.   *      character (not byte) position in the UTF-8 string.
693   *   *
694   * Results:   * Results:
695   *      As above.   *      As above.
696   *   *
697   * Side effects:   * Side effects:
698   *      None.   *      None.
699   *   *
700   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
701   */   */
702    
703  Tcl_UniChar  Tcl_UniChar
704  Tcl_UniCharAtIndex(src, index)  Tcl_UniCharAtIndex(src, index)
705      register CONST char *src;   /* The UTF-8 string to dereference. */      register CONST char *src;   /* The UTF-8 string to dereference. */
706      register int index;         /* The position of the desired character. */      register int index;         /* The position of the desired character. */
707  {  {
708      Tcl_UniChar ch;      Tcl_UniChar ch;
709    
710      while (index >= 0) {      while (index >= 0) {
711          index--;          index--;
712          src += Tcl_UtfToUniChar(src, &ch);          src += Tcl_UtfToUniChar(src, &ch);
713      }      }
714      return ch;      return ch;
715  }  }
716    
717  /*  /*
718   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
719   *   *
720   * Tcl_UtfAtIndex --   * Tcl_UtfAtIndex --
721   *   *
722   *      Returns a pointer to the specified character (not byte) position   *      Returns a pointer to the specified character (not byte) position
723   *      in the UTF-8 string.   *      in the UTF-8 string.
724   *   *
725   * Results:   * Results:
726   *      As above.   *      As above.
727   *   *
728   * Side effects:   * Side effects:
729   *      None.   *      None.
730   *   *
731   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
732   */   */
733    
734  char *  char *
735  Tcl_UtfAtIndex(src, index)  Tcl_UtfAtIndex(src, index)
736      register CONST char *src;   /* The UTF-8 string. */      register CONST char *src;   /* The UTF-8 string. */
737      register int index;         /* The position of the desired character. */      register int index;         /* The position of the desired character. */
738  {  {
739      Tcl_UniChar ch;      Tcl_UniChar ch;
740            
741      while (index > 0) {      while (index > 0) {
742          index--;          index--;
743          src += Tcl_UtfToUniChar(src, &ch);          src += Tcl_UtfToUniChar(src, &ch);
744      }      }
745      return (char *) src;      return (char *) src;
746  }  }
747    
748  /*  /*
749   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
750   *   *
751   * Tcl_UtfBackslash --   * Tcl_UtfBackslash --
752   *   *
753   *      Figure out how to handle a backslash sequence.   *      Figure out how to handle a backslash sequence.
754   *   *
755   * Results:   * Results:
756   *      Stores the bytes represented by the backslash sequence in dst and   *      Stores the bytes represented by the backslash sequence in dst and
757   *      returns the number of bytes written to dst.  At most TCL_UTF_MAX   *      returns the number of bytes written to dst.  At most TCL_UTF_MAX
758   *      bytes are written to dst; dst must have been large enough to accept   *      bytes are written to dst; dst must have been large enough to accept
759   *      those bytes.  If readPtr isn't NULL then it is filled in with a   *      those bytes.  If readPtr isn't NULL then it is filled in with a
760   *      count of the number of bytes in the backslash sequence.     *      count of the number of bytes in the backslash sequence.  
761   *   *
762   * Side effects:   * Side effects:
763   *      The maximum number of bytes it takes to represent a Unicode   *      The maximum number of bytes it takes to represent a Unicode
764   *      character in UTF-8 is guaranteed to be less than the number of   *      character in UTF-8 is guaranteed to be less than the number of
765   *      bytes used to express the backslash sequence that represents   *      bytes used to express the backslash sequence that represents
766   *      that Unicode character.  If the target buffer into which the   *      that Unicode character.  If the target buffer into which the
767   *      caller is going to store the bytes that represent the Unicode   *      caller is going to store the bytes that represent the Unicode
768   *      character is at least as large as the source buffer from which   *      character is at least as large as the source buffer from which
769   *      the backslashed sequence was extracted, no buffer overruns should   *      the backslashed sequence was extracted, no buffer overruns should
770   *      occur.   *      occur.
771   *   *
772   *---------------------------------------------------------------------------   *---------------------------------------------------------------------------
773   */   */
774    
775  int  int
776  Tcl_UtfBackslash(src, readPtr, dst)  Tcl_UtfBackslash(src, readPtr, dst)
777      CONST char *src;            /* Points to the backslash character of      CONST char *src;            /* Points to the backslash character of
778                                   * a backslash sequence. */                                   * a backslash sequence. */
779      int *readPtr;               /* Fill in with number of characters read      int *readPtr;               /* Fill in with number of characters read
780                                   * from src, unless NULL. */                                   * from src, unless NULL. */
781      char *dst;                  /* Filled with the bytes represented by the      char *dst;                  /* Filled with the bytes represented by the
782                                   * backslash sequence. */                                   * backslash sequence. */
783  {  {
784      register CONST char *p = src+1;      register CONST char *p = src+1;
785      int result, count, n;      int result, count, n;
786      char buf[TCL_UTF_MAX];      char buf[TCL_UTF_MAX];
787    
788      if (dst == NULL) {      if (dst == NULL) {
789          dst = buf;          dst = buf;
790      }      }
791    
792      count = 2;      count = 2;
793      switch (*p) {      switch (*p) {
794          /*          /*
795           * Note: in the conversions below, use absolute values (e.g.,           * Note: in the conversions below, use absolute values (e.g.,
796           * 0xa) rather than symbolic values (e.g. \n) that get converted           * 0xa) rather than symbolic values (e.g. \n) that get converted
797           * by the compiler.  It's possible that compilers on some           * by the compiler.  It's possible that compilers on some
798           * platforms will do the symbolic conversions differently, which           * platforms will do the symbolic conversions differently, which
799           * could result in non-portable Tcl scripts.           * could result in non-portable Tcl scripts.
800           */           */
801    
802          case 'a':          case 'a':
803              result = 0x7;              result = 0x7;
804              break;              break;
805          case 'b':          case 'b':
806              result = 0x8;              result = 0x8;
807              break;              break;
808          case 'f':          case 'f':
809              result = 0xc;              result = 0xc;
810              break;              break;
811          case 'n':          case 'n':
812              result = 0xa;              result = 0xa;
813              break;              break;
814          case 'r':          case 'r':
815              result = 0xd;              result = 0xd;
816              break;              break;
817          case 't':          case 't':
818              result = 0x9;              result = 0x9;
819              break;              break;
820          case 'v':          case 'v':
821              result = 0xb;              result = 0xb;
822              break;              break;
823          case 'x':          case 'x':
824              if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */              if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
825                  char *end;                  char *end;
826    
827                  result = (unsigned char) strtoul(p+1, &end, 16);                  result = (unsigned char) strtoul(p+1, &end, 16);
828                  count = end - src;                  count = end - src;
829              } else {              } else {
830                  count = 2;                  count = 2;
831                  result = 'x';                  result = 'x';
832              }              }
833              break;              break;
834          case 'u':          case 'u':
835              result = 0;              result = 0;
836              for (count = 0; count < 4; count++) {              for (count = 0; count < 4; count++) {
837                  p++;                  p++;
838                  if (!isxdigit(UCHAR(*p))) { /* INTL: digit */                  if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
839                      break;                      break;
840                  }                  }
841                  n = *p - '0';                  n = *p - '0';
842                  if (n > 9) {                  if (n > 9) {
843                      n = n + '0' + 10 - 'A';                      n = n + '0' + 10 - 'A';
844                  }                  }
845                  if (n > 16) {                  if (n > 16) {
846                      n = n + 'A' - 'a';                      n = n + 'A' - 'a';
847                  }                  }
848                  result = (result << 4) + n;                  result = (result << 4) + n;
849              }              }
850              if (count == 0) {              if (count == 0) {
851                  result = 'u';                  result = 'u';
852              }              }
853              count += 2;              count += 2;
854              break;              break;
855                                            
856          case '\n':          case '\n':
857              do {              do {
858                  p++;                  p++;
859              } while ((*p == ' ') || (*p == '\t'));              } while ((*p == ' ') || (*p == '\t'));
860              result = ' ';              result = ' ';
861              count = p - src;              count = p - src;
862              break;              break;
863          case 0:          case 0:
864              result = '\\';              result = '\\';
865              count = 1;              count = 1;
866              break;              break;
867          default:          default:
868              /*              /*
869               * Check for an octal number \oo?o?               * Check for an octal number \oo?o?
870               */               */
871              if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */              if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
872                  result = (unsigned char)(*p - '0');                  result = (unsigned char)(*p - '0');
873                  p++;                  p++;
874                  if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */                  if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
875                      break;                      break;
876                  }                  }
877                  count = 3;                  count = 3;
878                  result = (unsigned char)((result << 3) + (*p - '0'));                  result = (unsigned char)((result << 3) + (*p - '0'));
879                  p++;                  p++;
880                  if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */                  if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
881                      break;                      break;
882                  }                  }
883                  count = 4;                  count = 4;
884                  result = (unsigned char)((result << 3) + (*p - '0'));                  result = (unsigned char)((result << 3) + (*p - '0'));
885                  break;                  break;
886              }              }
887              result = *p;              result = *p;
888              count = 2;              count = 2;
889              break;              break;
890      }      }
891    
892      if (readPtr != NULL) {      if (readPtr != NULL) {
893          *readPtr = count;          *readPtr = count;
894      }      }
895      return Tcl_UniCharToUtf(result, dst);      return Tcl_UniCharToUtf(result, dst);
896  }  }
897    
898  /*  /*
899   *----------------------------------------------------------------------   *----------------------------------------------------------------------
900   *   *
901   * Tcl_UtfToUpper --   * Tcl_UtfToUpper --
902   *   *
903   *      Convert lowercase characters to uppercase characters in a UTF   *      Convert lowercase characters to uppercase characters in a UTF
904   *      string in place.  The conversion may shrink the UTF string.   *      string in place.  The conversion may shrink the UTF string.
905   *   *
906   * Results:   * Results:
907   *      Returns the number of bytes in the resulting string   *      Returns the number of bytes in the resulting string
908   *      excluding the trailing null.   *      excluding the trailing null.
909   *   *
910   * Side effects:   * Side effects:
911   *      Writes a terminating null after the last converted character.   *      Writes a terminating null after the last converted character.
912   *   *
913   *----------------------------------------------------------------------   *----------------------------------------------------------------------
914   */   */
915    
916  int  int
917  Tcl_UtfToUpper(str)  Tcl_UtfToUpper(str)
918      char *str;                  /* String to convert in place. */      char *str;                  /* String to convert in place. */
919  {  {
920      Tcl_UniChar ch, upChar;      Tcl_UniChar ch, upChar;
921      char *src, *dst;      char *src, *dst;
922      int bytes;      int bytes;
923    
924      /*      /*
925       * Iterate over the string until we hit the terminating null.       * Iterate over the string until we hit the terminating null.
926       */       */
927    
928      src = dst = str;      src = dst = str;
929      while (*src) {      while (*src) {
930          bytes = Tcl_UtfToUniChar(src, &ch);          bytes = Tcl_UtfToUniChar(src, &ch);
931          upChar = Tcl_UniCharToUpper(ch);          upChar = Tcl_UniCharToUpper(ch);
932    
933          /*          /*
934           * To keep badly formed Utf strings from getting inflated by           * To keep badly formed Utf strings from getting inflated by
935           * the conversion (thereby causing a segfault), only copy the           * the conversion (thereby causing a segfault), only copy the
936           * upper case char to dst if its size is <= the original char.           * upper case char to dst if its size is <= the original char.
937           */           */
938                    
939          if (bytes < UtfCount(upChar)) {          if (bytes < UtfCount(upChar)) {
940              memcpy(dst, src, (size_t) bytes);              memcpy(dst, src, (size_t) bytes);
941              dst += bytes;              dst += bytes;
942          } else {          } else {
943              dst += Tcl_UniCharToUtf(upChar, dst);              dst += Tcl_UniCharToUtf(upChar, dst);
944          }          }
945          src += bytes;          src += bytes;
946      }      }
947      *dst = '\0';      *dst = '\0';
948      return (dst - str);      return (dst - str);
949  }  }
950    
951  /*  /*
952   *----------------------------------------------------------------------   *----------------------------------------------------------------------
953   *   *
954   * Tcl_UtfToLower --   * Tcl_UtfToLower --
955   *   *
956   *      Convert uppercase characters to lowercase characters in a UTF   *      Convert uppercase characters to lowercase characters in a UTF
957   *      string in place.  The conversion may shrink the UTF string.   *      string in place.  The conversion may shrink the UTF string.
958   *   *
959   * Results:   * Results:
960   *      Returns the number of bytes in the resulting string   *      Returns the number of bytes in the resulting string
961   *      excluding the trailing null.   *      excluding the trailing null.
962   *   *
963   * Side effects:   * Side effects:
964   *      Writes a terminating null after the last converted character.   *      Writes a terminating null after the last converted character.
965   *   *
966   *----------------------------------------------------------------------   *----------------------------------------------------------------------
967   */   */
968    
969  int  int
970  Tcl_UtfToLower(str)  Tcl_UtfToLower(str)
971      char *str;                  /* String to convert in place. */      char *str;                  /* String to convert in place. */
972  {  {
973      Tcl_UniChar ch, lowChar;      Tcl_UniChar ch, lowChar;
974      char *src, *dst;      char *src, *dst;
975      int bytes;      int bytes;
976            
977      /*      /*
978       * Iterate over the string until we hit the terminating null.       * Iterate over the string until we hit the terminating null.
979       */       */
980    
981      src = dst = str;      src = dst = str;
982      while (*src) {      while (*src) {
983          bytes = Tcl_UtfToUniChar(src, &ch);          bytes = Tcl_UtfToUniChar(src, &ch);
984          lowChar = Tcl_UniCharToLower(ch);          lowChar = Tcl_UniCharToLower(ch);
985    
986          /*          /*
987           * To keep badly formed Utf strings from getting inflated by           * To keep badly formed Utf strings from getting inflated by
988           * the conversion (thereby causing a segfault), only copy the           * the conversion (thereby causing a segfault), only copy the
989           * lower case char to dst if its size is <= the original char.           * lower case char to dst if its size is <= the original char.
990           */           */
991                    
992          if (bytes < UtfCount(lowChar)) {          if (bytes < UtfCount(lowChar)) {
993              memcpy(dst, src, (size_t) bytes);              memcpy(dst, src, (size_t) bytes);
994              dst += bytes;              dst += bytes;
995          } else {          } else {
996              dst += Tcl_UniCharToUtf(lowChar, dst);              dst += Tcl_UniCharToUtf(lowChar, dst);
997          }          }
998          src += bytes;          src += bytes;
999      }      }
1000      *dst = '\0';      *dst = '\0';
1001      return (dst - str);      return (dst - str);
1002  }  }
1003    
1004  /*  /*
1005   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1006   *   *
1007   * Tcl_UtfToTitle --   * Tcl_UtfToTitle --
1008   *   *
1009   *      Changes the first character of a UTF string to title case or   *      Changes the first character of a UTF string to title case or
1010   *      uppercase and the rest of the string to lowercase.  The   *      uppercase and the rest of the string to lowercase.  The
1011   *      conversion happens in place and may shrink the UTF string.   *      conversion happens in place and may shrink the UTF string.
1012   *   *
1013   * Results:   * Results:
1014   *      Returns the number of bytes in the resulting string   *      Returns the number of bytes in the resulting string
1015   *      excluding the trailing null.   *      excluding the trailing null.
1016   *   *
1017   * Side effects:   * Side effects:
1018   *      Writes a terminating null after the last converted character.   *      Writes a terminating null after the last converted character.
1019   *   *
1020   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1021   */   */
1022    
1023  int  int
1024  Tcl_UtfToTitle(str)  Tcl_UtfToTitle(str)
1025      char *str;                  /* String to convert in place. */      char *str;                  /* String to convert in place. */
1026  {  {
1027      Tcl_UniChar ch, titleChar, lowChar;      Tcl_UniChar ch, titleChar, lowChar;
1028      char *src, *dst;      char *src, *dst;
1029      int bytes;      int bytes;
1030            
1031      /*      /*
1032       * Capitalize the first character and then lowercase the rest of the       * Capitalize the first character and then lowercase the rest of the
1033       * characters until we get to a null.       * characters until we get to a null.
1034       */       */
1035    
1036      src = dst = str;      src = dst = str;
1037    
1038      if (*src) {      if (*src) {
1039          bytes = Tcl_UtfToUniChar(src, &ch);          bytes = Tcl_UtfToUniChar(src, &ch);
1040          titleChar = Tcl_UniCharToTitle(ch);          titleChar = Tcl_UniCharToTitle(ch);
1041    
1042          if (bytes < UtfCount(titleChar)) {          if (bytes < UtfCount(titleChar)) {
1043              memcpy(dst, src, (size_t) bytes);              memcpy(dst, src, (size_t) bytes);
1044              dst += bytes;              dst += bytes;
1045          } else {          } else {
1046              dst += Tcl_UniCharToUtf(titleChar, dst);              dst += Tcl_UniCharToUtf(titleChar, dst);
1047          }          }
1048          src += bytes;          src += bytes;
1049      }      }
1050      while (*src) {      while (*src) {
1051          bytes = Tcl_UtfToUniChar(src, &ch);          bytes = Tcl_UtfToUniChar(src, &ch);
1052          lowChar = Tcl_UniCharToLower(ch);          lowChar = Tcl_UniCharToLower(ch);
1053    
1054          if (bytes < UtfCount(lowChar)) {          if (bytes < UtfCount(lowChar)) {
1055              memcpy(dst, src, (size_t) bytes);              memcpy(dst, src, (size_t) bytes);
1056              dst += bytes;              dst += bytes;
1057          } else {          } else {
1058              dst += Tcl_UniCharToUtf(lowChar, dst);              dst += Tcl_UniCharToUtf(lowChar, dst);
1059          }          }
1060          src += bytes;          src += bytes;
1061      }      }
1062      *dst = '\0';      *dst = '\0';
1063      return (dst - str);      return (dst - str);
1064  }  }
1065    
1066  /*  /*
1067   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1068   *   *
1069   * Tcl_UtfNcmp --   * Tcl_UtfNcmp --
1070   *   *
1071   *      Compare at most n UTF chars of string cs to string ct.  Both cs   *      Compare at most n UTF chars of string cs to string ct.  Both cs
1072   *      and ct are assumed to be at least n UTF chars long.   *      and ct are assumed to be at least n UTF chars long.
1073   *   *
1074   * Results:   * Results:
1075   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1076   *   *
1077   * Side effects:   * Side effects:
1078   *      None.   *      None.
1079   *   *
1080   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1081   */   */
1082    
1083  int  int
1084  Tcl_UtfNcmp(cs, ct, n)  Tcl_UtfNcmp(cs, ct, n)
1085      CONST char *cs;             /* UTF string to compare to ct. */      CONST char *cs;             /* UTF string to compare to ct. */
1086      CONST char *ct;             /* UTF string cs is compared to. */      CONST char *ct;             /* UTF string cs is compared to. */
1087      unsigned long n;            /* Number of UTF chars to compare. */      unsigned long n;            /* Number of UTF chars to compare. */
1088  {  {
1089      Tcl_UniChar ch1, ch2;      Tcl_UniChar ch1, ch2;
1090      /*      /*
1091       * Another approach that should work is:       * Another approach that should work is:
1092       *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));       *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1093       * That assumes that ct is a properly formed UTF, so we will just       * That assumes that ct is a properly formed UTF, so we will just
1094       * be comparing the bytes that compromise those strings to the       * be comparing the bytes that compromise those strings to the
1095       * char length n.       * char length n.
1096       */       */
1097      while (n-- > 0) {      while (n-- > 0) {
1098          /*          /*
1099           * n must be interpreted as chars, not bytes.           * n must be interpreted as chars, not bytes.
1100           * This should be called only when both strings are of           * This should be called only when both strings are of
1101           * at least n chars long (no need for \0 check)           * at least n chars long (no need for \0 check)
1102           */           */
1103          cs += Tcl_UtfToUniChar(cs, &ch1);          cs += Tcl_UtfToUniChar(cs, &ch1);
1104          ct += Tcl_UtfToUniChar(ct, &ch2);          ct += Tcl_UtfToUniChar(ct, &ch2);
1105          if (ch1 != ch2) {          if (ch1 != ch2) {
1106              return (ch1 - ch2);              return (ch1 - ch2);
1107          }          }
1108      }      }
1109      return 0;      return 0;
1110  }  }
1111    
1112  /*  /*
1113   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1114   *   *
1115   * Tcl_UtfNcasecmp --   * Tcl_UtfNcasecmp --
1116   *   *
1117   *      Compare at most n UTF chars of string cs to string ct case   *      Compare at most n UTF chars of string cs to string ct case
1118   *      insensitive.  Both cs and ct are assumed to be at least n   *      insensitive.  Both cs and ct are assumed to be at least n
1119   *      UTF chars long.   *      UTF chars long.
1120   *   *
1121   * Results:   * Results:
1122   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1123   *   *
1124   * Side effects:   * Side effects:
1125   *      None.   *      None.
1126   *   *
1127   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1128   */   */
1129    
1130  int  int
1131  Tcl_UtfNcasecmp(cs, ct, n)  Tcl_UtfNcasecmp(cs, ct, n)
1132      CONST char *cs;             /* UTF string to compare to ct. */      CONST char *cs;             /* UTF string to compare to ct. */
1133      CONST char *ct;             /* UTF string cs is compared to. */      CONST char *ct;             /* UTF string cs is compared to. */
1134      unsigned long n;                    /* Number of UTF chars to compare. */      unsigned long n;                    /* Number of UTF chars to compare. */
1135  {  {
1136      Tcl_UniChar ch1, ch2;      Tcl_UniChar ch1, ch2;
1137      while (n-- > 0) {      while (n-- > 0) {
1138          /*          /*
1139           * n must be interpreted as chars, not bytes.           * n must be interpreted as chars, not bytes.
1140           * This should be called only when both strings are of           * This should be called only when both strings are of
1141           * at least n chars long (no need for \0 check)           * at least n chars long (no need for \0 check)
1142           */           */
1143          cs += Tcl_UtfToUniChar(cs, &ch1);          cs += Tcl_UtfToUniChar(cs, &ch1);
1144          ct += Tcl_UtfToUniChar(ct, &ch2);          ct += Tcl_UtfToUniChar(ct, &ch2);
1145          if (ch1 != ch2) {          if (ch1 != ch2) {
1146              ch1 = Tcl_UniCharToLower(ch1);              ch1 = Tcl_UniCharToLower(ch1);
1147              ch2 = Tcl_UniCharToLower(ch2);              ch2 = Tcl_UniCharToLower(ch2);
1148              if (ch1 != ch2) {              if (ch1 != ch2) {
1149                  return (ch1 - ch2);                  return (ch1 - ch2);
1150              }              }
1151          }          }
1152      }      }
1153      return 0;      return 0;
1154  }  }
1155    
1156  /*  /*
1157   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1158   *   *
1159   * Tcl_UniCharToUpper --   * Tcl_UniCharToUpper --
1160   *   *
1161   *      Compute the uppercase equivalent of the given Unicode character.   *      Compute the uppercase equivalent of the given Unicode character.
1162   *   *
1163   * Results:   * Results:
1164   *      Returns the uppercase Unicode character.   *      Returns the uppercase Unicode character.
1165   *   *
1166   * Side effects:   * Side effects:
1167   *      None.   *      None.
1168   *   *
1169   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1170   */   */
1171    
1172  Tcl_UniChar  Tcl_UniChar
1173  Tcl_UniCharToUpper(ch)  Tcl_UniCharToUpper(ch)
1174      int ch;                     /* Unicode character to convert. */      int ch;                     /* Unicode character to convert. */
1175  {  {
1176      int info = GetUniCharInfo(ch);      int info = GetUniCharInfo(ch);
1177    
1178      if (GetCaseType(info) & 0x04) {      if (GetCaseType(info) & 0x04) {
1179          return (Tcl_UniChar) (ch - GetDelta(info));          return (Tcl_UniChar) (ch - GetDelta(info));
1180      } else {      } else {
1181          return ch;          return ch;
1182      }      }
1183  }  }
1184    
1185  /*  /*
1186   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1187   *   *
1188   * Tcl_UniCharToLower --   * Tcl_UniCharToLower --
1189   *   *
1190   *      Compute the lowercase equivalent of the given Unicode character.   *      Compute the lowercase equivalent of the given Unicode character.
1191   *   *
1192   * Results:   * Results:
1193   *      Returns the lowercase Unicode character.   *      Returns the lowercase Unicode character.
1194   *   *
1195   * Side effects:   * Side effects:
1196   *      None.   *      None.
1197   *   *
1198   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1199   */   */
1200    
1201  Tcl_UniChar  Tcl_UniChar
1202  Tcl_UniCharToLower(ch)  Tcl_UniCharToLower(ch)
1203      int ch;                     /* Unicode character to convert. */      int ch;                     /* Unicode character to convert. */
1204  {  {
1205      int info = GetUniCharInfo(ch);      int info = GetUniCharInfo(ch);
1206    
1207      if (GetCaseType(info) & 0x02) {      if (GetCaseType(info) & 0x02) {
1208          return (Tcl_UniChar) (ch + GetDelta(info));          return (Tcl_UniChar) (ch + GetDelta(info));
1209      } else {      } else {
1210          return ch;          return ch;
1211      }      }
1212  }  }
1213    
1214  /*  /*
1215   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1216   *   *
1217   * Tcl_UniCharToTitle --   * Tcl_UniCharToTitle --
1218   *   *
1219   *      Compute the titlecase equivalent of the given Unicode character.   *      Compute the titlecase equivalent of the given Unicode character.
1220   *   *
1221   * Results:   * Results:
1222   *      Returns the titlecase Unicode character.   *      Returns the titlecase Unicode character.
1223   *   *
1224   * Side effects:   * Side effects:
1225   *      None.   *      None.
1226   *   *
1227   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1228   */   */
1229    
1230  Tcl_UniChar  Tcl_UniChar
1231  Tcl_UniCharToTitle(ch)  Tcl_UniCharToTitle(ch)
1232      int ch;                     /* Unicode character to convert. */      int ch;                     /* Unicode character to convert. */
1233  {  {
1234      int info = GetUniCharInfo(ch);      int info = GetUniCharInfo(ch);
1235      int mode = GetCaseType(info);      int mode = GetCaseType(info);
1236    
1237      if (mode & 0x1) {      if (mode & 0x1) {
1238          /*          /*
1239           * Subtract or add one depending on the original case.           * Subtract or add one depending on the original case.
1240           */           */
1241    
1242          return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));          return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1243      } else if (mode == 0x4) {      } else if (mode == 0x4) {
1244          return (Tcl_UniChar) (ch - GetDelta(info));          return (Tcl_UniChar) (ch - GetDelta(info));
1245      } else {      } else {
1246          return ch;          return ch;
1247      }      }
1248  }  }
1249    
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharLen --
 *
 *      Counts the UniChars in str that precede the first null
 *      UniChar.  The input must be null terminated.
 *
 * Results:
 *      Returns the length of str in UniChars (not bytes).
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharLen(str)
    Tcl_UniChar *str;           /* Unicode string to find length of. */
{
    Tcl_UniChar *end;

    for (end = str; *end != '\0'; end++) {
        /* Empty body: just advance to the terminator. */
    }
    return (int) (end - str);
}
1279    
1280  /*  /*
1281   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1282   *   *
1283   * Tcl_UniCharNcmp --   * Tcl_UniCharNcmp --
1284   *   *
1285   *      Compare at most n unichars of string cs to string ct.  Both cs   *      Compare at most n unichars of string cs to string ct.  Both cs
1286   *      and ct are assumed to be at least n unichars long.   *      and ct are assumed to be at least n unichars long.
1287   *   *
1288   * Results:   * Results:
1289   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.   *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1290   *   *
1291   * Side effects:   * Side effects:
1292   *      None.   *      None.
1293   *   *
1294   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1295   */   */
1296    
1297  int  int
1298  Tcl_UniCharNcmp(cs, ct, n)  Tcl_UniCharNcmp(cs, ct, n)
1299      CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */      CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */
1300      CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */      CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */
1301      unsigned long n;                    /* Number of unichars to compare. */      unsigned long n;                    /* Number of unichars to compare. */
1302  {  {
1303      for ( ; n != 0; n--, cs++, ct++) {      for ( ; n != 0; n--, cs++, ct++) {
1304          if (*cs != *ct) {          if (*cs != *ct) {
1305              return *cs - *ct;              return *cs - *ct;
1306          }          }
1307          if (*cs == '\0') {          if (*cs == '\0') {
1308              break;              break;
1309          }          }
1310      }      }
1311      return 0;      return 0;
1312  }  }
1313    
1314  /*  /*
1315   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1316   *   *
1317   * Tcl_UniCharIsAlnum --   * Tcl_UniCharIsAlnum --
1318   *   *
1319   *      Test if a character is an alphanumeric Unicode character.   *      Test if a character is an alphanumeric Unicode character.
1320   *   *
1321   * Results:   * Results:
1322   *      Returns 1 if character is alphanumeric.   *      Returns 1 if character is alphanumeric.
1323   *   *
1324   * Side effects:   * Side effects:
1325   *      None.   *      None.
1326   *   *
1327   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1328   */   */
1329    
1330  int  int
1331  Tcl_UniCharIsAlnum(ch)  Tcl_UniCharIsAlnum(ch)
1332      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1333  {  {
1334      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1335    
1336      return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);      return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1337  }  }
1338    
1339  /*  /*
1340   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1341   *   *
1342   * Tcl_UniCharIsAlpha --   * Tcl_UniCharIsAlpha --
1343   *   *
1344   *      Test if a character is an alphabetic Unicode character.   *      Test if a character is an alphabetic Unicode character.
1345   *   *
1346   * Results:   * Results:
1347   *      Returns 1 if character is alphabetic.   *      Returns 1 if character is alphabetic.
1348   *   *
1349   * Side effects:   * Side effects:
1350   *      None.   *      None.
1351   *   *
1352   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1353   */   */
1354    
1355  int  int
1356  Tcl_UniCharIsAlpha(ch)  Tcl_UniCharIsAlpha(ch)
1357      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1358  {  {
1359      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1360      return ((ALPHA_BITS >> category) & 1);      return ((ALPHA_BITS >> category) & 1);
1361  }  }
1362    
1363  /*  /*
1364   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1365   *   *
1366   * Tcl_UniCharIsControl --   * Tcl_UniCharIsControl --
1367   *   *
1368   *      Test if a character is a Unicode control character.   *      Test if a character is a Unicode control character.
1369   *   *
1370   * Results:   * Results:
1371   *      Returns non-zero if character is a control.   *      Returns non-zero if character is a control.
1372   *   *
1373   * Side effects:   * Side effects:
1374   *      None.   *      None.
1375   *   *
1376   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1377   */   */
1378    
1379  int  int
1380  Tcl_UniCharIsControl(ch)  Tcl_UniCharIsControl(ch)
1381      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1382  {  {
1383      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1384  }  }
1385    
1386  /*  /*
1387   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1388   *   *
1389   * Tcl_UniCharIsDigit --   * Tcl_UniCharIsDigit --
1390   *   *
1391   *      Test if a character is a numeric Unicode character.   *      Test if a character is a numeric Unicode character.
1392   *   *
1393   * Results:   * Results:
1394   *      Returns non-zero if character is a digit.   *      Returns non-zero if character is a digit.
1395   *   *
1396   * Side effects:   * Side effects:
1397   *      None.   *      None.
1398   *   *
1399   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1400   */   */
1401    
1402  int  int
1403  Tcl_UniCharIsDigit(ch)  Tcl_UniCharIsDigit(ch)
1404      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1405  {  {
1406      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1407              == DECIMAL_DIGIT_NUMBER);              == DECIMAL_DIGIT_NUMBER);
1408  }  }
1409    
1410  /*  /*
1411   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1412   *   *
1413   * Tcl_UniCharIsGraph --   * Tcl_UniCharIsGraph --
1414   *   *
1415   *      Test if a character is any Unicode print character except space.   *      Test if a character is any Unicode print character except space.
1416   *   *
1417   * Results:   * Results:
1418   *      Returns non-zero if character is printable, but not space.   *      Returns non-zero if character is printable, but not space.
1419   *   *
1420   * Side effects:   * Side effects:
1421   *      None.   *      None.
1422   *   *
1423   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1424   */   */
1425    
1426  int  int
1427  Tcl_UniCharIsGraph(ch)  Tcl_UniCharIsGraph(ch)
1428      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1429  {  {
1430      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1431      return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));      return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1432  }  }
1433    
1434  /*  /*
1435   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1436   *   *
1437   * Tcl_UniCharIsLower --   * Tcl_UniCharIsLower --
1438   *   *
1439   *      Test if a character is a lowercase Unicode character.   *      Test if a character is a lowercase Unicode character.
1440   *   *
1441   * Results:   * Results:
1442   *      Returns non-zero if character is lowercase.   *      Returns non-zero if character is lowercase.
1443   *   *
1444   * Side effects:   * Side effects:
1445   *      None.   *      None.
1446   *   *
1447   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1448   */   */
1449    
1450  int  int
1451  Tcl_UniCharIsLower(ch)  Tcl_UniCharIsLower(ch)
1452      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1453  {  {
1454      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1455  }  }
1456    
1457  /*  /*
1458   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1459   *   *
1460   * Tcl_UniCharIsPrint --   * Tcl_UniCharIsPrint --
1461   *   *
1462   *      Test if a character is a Unicode print character.   *      Test if a character is a Unicode print character.
1463   *   *
1464   * Results:   * Results:
1465   *      Returns non-zero if character is printable.   *      Returns non-zero if character is printable.
1466   *   *
1467   * Side effects:   * Side effects:
1468   *      None.   *      None.
1469   *   *
1470   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1471   */   */
1472    
1473  int  int
1474  Tcl_UniCharIsPrint(ch)  Tcl_UniCharIsPrint(ch)
1475      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1476  {  {
1477      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1478      return ((PRINT_BITS >> category) & 1);      return ((PRINT_BITS >> category) & 1);
1479  }  }
1480    
1481  /*  /*
1482   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1483   *   *
1484   * Tcl_UniCharIsPunct --   * Tcl_UniCharIsPunct --
1485   *   *
1486   *      Test if a character is a Unicode punctuation character.   *      Test if a character is a Unicode punctuation character.
1487   *   *
1488   * Results:   * Results:
1489   *      Returns non-zero if character is punct.   *      Returns non-zero if character is punct.
1490   *   *
1491   * Side effects:   * Side effects:
1492   *      None.   *      None.
1493   *   *
1494   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1495   */   */
1496    
1497  int  int
1498  Tcl_UniCharIsPunct(ch)  Tcl_UniCharIsPunct(ch)
1499      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1500  {  {
1501      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1502      return ((PUNCT_BITS >> category) & 1);      return ((PUNCT_BITS >> category) & 1);
1503  }  }
1504    
1505  /*  /*
1506   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1507   *   *
1508   * Tcl_UniCharIsSpace --   * Tcl_UniCharIsSpace --
1509   *   *
1510   *      Test if a character is a whitespace Unicode character.   *      Test if a character is a whitespace Unicode character.
1511   *   *
1512   * Results:   * Results:
1513   *      Returns non-zero if character is a space.   *      Returns non-zero if character is a space.
1514   *   *
1515   * Side effects:   * Side effects:
1516   *      None.   *      None.
1517   *   *
1518   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1519   */   */
1520    
1521  int  int
1522  Tcl_UniCharIsSpace(ch)  Tcl_UniCharIsSpace(ch)
1523      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1524  {  {
1525      register int category;      register int category;
1526    
1527      /*      /*
1528       * If the character is within the first 127 characters, just use the       * If the character is within the first 127 characters, just use the
1529       * standard C function, otherwise consult the Unicode table.       * standard C function, otherwise consult the Unicode table.
1530       */       */
1531    
1532      if (ch < 0x80) {      if (ch < 0x80) {
1533          return isspace(UCHAR(ch)); /* INTL: ISO space */          return isspace(UCHAR(ch)); /* INTL: ISO space */
1534      } else {      } else {
1535          category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);          category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1536          return ((SPACE_BITS >> category) & 1);          return ((SPACE_BITS >> category) & 1);
1537      }      }
1538  }  }
1539    
1540  /*  /*
1541   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1542   *   *
1543   * Tcl_UniCharIsUpper --   * Tcl_UniCharIsUpper --
1544   *   *
1545   *      Test if a character is a uppercase Unicode character.   *      Test if a character is a uppercase Unicode character.
1546   *   *
1547   * Results:   * Results:
1548   *      Returns non-zero if character is uppercase.   *      Returns non-zero if character is uppercase.
1549   *   *
1550   * Side effects:   * Side effects:
1551   *      None.   *      None.
1552   *   *
1553   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1554   */   */
1555    
1556  int  int
1557  Tcl_UniCharIsUpper(ch)  Tcl_UniCharIsUpper(ch)
1558      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1559  {  {
1560      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);      return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1561  }  }
1562    
1563  /*  /*
1564   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1565   *   *
1566   * Tcl_UniCharIsWordChar --   * Tcl_UniCharIsWordChar --
1567   *   *
1568   *      Test if a character is alphanumeric or a connector punctuation   *      Test if a character is alphanumeric or a connector punctuation
1569   *      mark.   *      mark.
1570   *   *
1571   * Results:   * Results:
1572   *      Returns 1 if character is a word character.   *      Returns 1 if character is a word character.
1573   *   *
1574   * Side effects:   * Side effects:
1575   *      None.   *      None.
1576   *   *
1577   *----------------------------------------------------------------------   *----------------------------------------------------------------------
1578   */   */
1579    
1580  int  int
1581  Tcl_UniCharIsWordChar(ch)  Tcl_UniCharIsWordChar(ch)
1582      int ch;                     /* Unicode character to test. */      int ch;                     /* Unicode character to test. */
1583  {  {
1584      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);      register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1585    
1586      return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);      return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1587  }  }
1588    
1589  /* End of tclutf.c */  /* End of tclutf.c */

Legend:
Removed from v.67  
changed lines
  Added in v.71

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25