/[dtapublic]/projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Diff of /projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

sf_code/esrgpcpj/shared/tcl_base/tclutf.c revision 25 by dashley, Sat Oct 8 06:43:03 2016 UTC projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c revision 71 by dashley, Sat Nov 5 11:07:06 2016 UTC
# Line 1  Line 1 
 /* $Header: /cvsroot/esrg/sfesrg/esrgpcpj/shared/tcl_base/tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ */  
   
 /*  
  * tclUtf.c --  
  *  
  *      Routines for manipulating UTF-8 strings.  
  *  
  * Copyright (c) 1997-1998 Sun Microsystems, Inc.  
  *  
  * See the file "license.terms" for information on usage and redistribution  
  * of this file, and for a DISCLAIMER OF ALL WARRANTIES.  
  *  
  * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $  
  */  
   
 #include "tclInt.h"  
   
 /*  
  * Include the static character classification tables and macros.  
  */  
   
 #include "tclUniData.c"  
   
 /*  
  * The following macros are used for fast character category tests.  The  
  * x_BITS values are shifted right by the category value to determine whether  
  * the given category is included in the set.  
  */  
   
 /*
  * Each mask below has one bit set per Unicode general category that belongs
  * to the class; the category constants are presumably supplied by the
  * included tclUniData.c -- confirm there.
  */

 /* Letters: upper, lower, title case, modifier and "other" letters. */
 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \  
     | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))  
   
 /* Digits: decimal digit numbers only. */
 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)  
   
 /* White space: space, line and paragraph separators. */
 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \  
     | (1 << PARAGRAPH_SEPARATOR))  
   
 /* Connector punctuation only. */
 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)  
   
 /*
  * Printable: letters, digits and white space plus every mark, number,
  * punctuation and symbol category listed here.
  */
 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \  
             (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \  
             (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \  
             (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \  
             (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \  
             (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \  
             (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \  
             (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \  
             (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))  
   
 /* Punctuation: all seven punctuation categories. */
 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \  
             (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \  
             (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \  
             (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))  
   
 /*  
  * Unicode characters less than this value are represented by themselves  
  * in UTF-8 strings.  
  */  
   
 #define UNICODE_SELF    0x80  
   
 /*  
  * The following structures are used when mapping between Unicode (UCS-2)  
  * and UTF-8.  
  */  
   
 /*
  * totalBytes[b] is the length in bytes of the UTF-8 sequence introduced by
  * the byte value b.  Entries are 1 for bytes that stand alone: 0x00-0x7F,
  * the trail bytes 0x80-0xBF, and any lead byte whose sequence would be
  * longer than TCL_UTF_MAX allows.
  */
 CONST unsigned char totalBytes[256] = {  
     /* 0x00-0xBF: single bytes and naked trail bytes. */
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  
     /* 0xC0-0xDF: lead bytes of two-byte sequences. */
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  
     /* 0xE0-0xEF: lead bytes of three-byte sequences. */
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,  
     /* 0xF0-0xF7: four-byte lead bytes, if TCL_UTF_MAX permits. */
 #if TCL_UTF_MAX > 3  
     4,4,4,4,4,4,4,4,  
 #else  
     1,1,1,1,1,1,1,1,  
 #endif  
     /* 0xF8-0xFB: five-byte lead bytes, if TCL_UTF_MAX permits. */
 #if TCL_UTF_MAX > 4  
     5,5,5,5,  
 #else  
     1,1,1,1,  
 #endif  
     /* 0xFC-0xFF: six-byte lead bytes, if TCL_UTF_MAX permits. */
 #if TCL_UTF_MAX > 5  
     6,6,6,6  
 #else  
     1,1,1,1  
 #endif  
 };  
   
 /*  
  * Procedures used only in this module.  
  */  
   
 static int UtfCount _ANSI_ARGS_((int ch));  
   
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * UtfCount --  
  *  
  *      Find the number of bytes in the Utf character "ch".  
  *  
  * Results:  
  *      The return values is the number of bytes in the Utf character "ch".  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 static int  
 UtfCount(ch)  
     int ch;                     /* The Tcl_UniChar whose size is returned. */  
 {  
     if ((ch > 0) && (ch < UNICODE_SELF)) {  
         return 1;  
     }  
     if (ch <= 0x7FF) {  
         return 2;  
     }  
     if (ch <= 0xFFFF) {  
         return 3;  
     }  
 #if TCL_UTF_MAX > 3  
     if (ch <= 0x1FFFFF) {  
         return 4;  
     }  
     if (ch <= 0x3FFFFFF) {  
         return 5;  
     }  
     if (ch <= 0x7FFFFFFF) {  
         return 6;  
     }  
 #endif  
     return 3;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UniCharToUtf --  
  *  
  *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the  
  *      provided buffer.  Equivalent to Plan 9 runetochar().  
  *  
  * Results:  
  *      The return values is the number of bytes in the buffer that  
  *      were consumed.    
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 INLINE int  
 Tcl_UniCharToUtf(ch, str)  
     int ch;                     /* The Tcl_UniChar to be stored in the  
                                  * buffer. */  
     char *str;                  /* Buffer in which the UTF-8 representation  
                                  * of the Tcl_UniChar is stored.  Buffer must  
                                  * be large enough to hold the UTF-8 character  
                                  * (at most TCL_UTF_MAX bytes). */  
 {  
     if ((ch > 0) && (ch < UNICODE_SELF)) {  
         str[0] = (char) ch;  
         return 1;  
     }  
     if (ch <= 0x7FF) {  
         str[1] = (char) ((ch | 0x80) & 0xBF);  
         str[0] = (char) ((ch >> 6) | 0xC0);  
         return 2;  
     }  
     if (ch <= 0xFFFF) {  
         three:  
         str[2] = (char) ((ch | 0x80) & 0xBF);  
         str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);  
         str[0] = (char) ((ch >> 12) | 0xE0);  
         return 3;  
     }  
   
 #if TCL_UTF_MAX > 3  
     if (ch <= 0x1FFFFF) {  
         str[3] = (char) ((ch | 0x80) & 0xBF);  
         str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);  
         str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);  
         str[0] = (char) ((ch >> 18) | 0xF0);  
         return 4;  
     }  
     if (ch <= 0x3FFFFFF) {  
         str[4] = (char) ((ch | 0x80) & 0xBF);  
         str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);  
         str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);  
         str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);  
         str[0] = (char) ((ch >> 24) | 0xF8);  
         return 5;  
     }  
     if (ch <= 0x7FFFFFFF) {  
         str[5] = (char) ((ch | 0x80) & 0xBF);  
         str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);  
         str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);  
         str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);  
         str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);  
         str[0] = (char) ((ch >> 30) | 0xFC);  
         return 6;  
     }  
 #endif  
   
     ch = 0xFFFD;  
     goto three;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UniCharToUtfDString --  
  *  
  *      Convert the given Unicode string to UTF-8.  
  *  
  * Results:  
  *      The return value is a pointer to the UTF-8 representation of the  
  *      Unicode string.  Storage for the return value is appended to the  
  *      end of dsPtr.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 char *  
 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)  
     CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */  
     int numChars;               /* Length of Unicode string in Tcl_UniChars  
                                  * (must be >= 0). */  
     Tcl_DString *dsPtr;         /* UTF-8 representation of string is  
                                  * appended to this previously initialized  
                                  * DString. */  
 {  
     CONST Tcl_UniChar *w, *wEnd;  
     char *p, *string;  
     int oldLength;  
   
     /*  
      * UTF-8 string length in bytes will be <= Unicode string length *  
      * TCL_UTF_MAX.  
      */  
   
     oldLength = Tcl_DStringLength(dsPtr);  
     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);  
     string = Tcl_DStringValue(dsPtr) + oldLength;  
   
     p = string;  
     wEnd = wString + numChars;  
     for (w = wString; w < wEnd; ) {  
         p += Tcl_UniCharToUtf(*w, p);  
         w++;  
     }  
     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));  
   
     return string;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfToUniChar --  
  *  
  *      Extract the Tcl_UniChar represented by the UTF-8 string.  Bad  
  *      UTF-8 sequences are converted to valid Tcl_UniChars and processing  
  *      continues.  Equivalent to Plan 9 chartorune().  
  *  
  *      The caller must ensure that the source buffer is long enough that  
  *      this routine does not run off the end and dereference non-existent  
  *      memory looking for trail bytes.  If the source buffer is known to  
  *      be '\0' terminated, this cannot happen.  Otherwise, the caller  
  *      should call Tcl_UtfCharComplete() before calling this routine to  
  *      ensure that enough bytes remain in the string.  
  *  
  * Results:  
  *      *chPtr is filled with the Tcl_UniChar, and the return value is the  
  *      number of bytes from the UTF-8 string that were consumed.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfToUniChar(str, chPtr)  
     register CONST char *str;    /* The UTF-8 string. */  
     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented  
                                   * by the UTF-8 string. */  
 {  
     register int byte;  
       
     /*  
      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.  
      */  
   
     byte = *((unsigned char *) str);  
     if (byte < 0xC0) {  
         /*  
          * Handles properly formed UTF-8 characters between 0x01 and 0x7F.  
          * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid  
          * characters representing themselves.  
          */  
           
         *chPtr = (Tcl_UniChar) byte;  
         return 1;  
     } else if (byte < 0xE0) {  
         if ((str[1] & 0xC0) == 0x80) {  
             /*  
              * Two-byte-character lead-byte followed by a trail-byte.  
              */  
               
             *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));  
             return 2;  
         }  
         /*  
          * A two-byte-character lead-byte not followed by trail-byte  
          * represents itself.  
          */  
           
         *chPtr = (Tcl_UniChar) byte;  
         return 1;  
     } else if (byte < 0xF0) {  
         if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {  
             /*  
              * Three-byte-character lead byte followed by two trail bytes.  
              */  
   
             *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)  
                     | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));  
             return 3;  
         }  
         /*  
          * A three-byte-character lead-byte not followed by two trail-bytes  
          * represents itself.  
          */  
   
         *chPtr = (Tcl_UniChar) byte;  
         return 1;  
     }  
 #if TCL_UTF_MAX > 3  
     else {  
         int ch, total, trail;  
   
         total = totalBytes[byte];  
         trail = total - 1;  
         if (trail > 0) {  
             ch = byte & (0x3F >> trail);  
             do {  
                 str++;  
                 if ((*str & 0xC0) != 0x80) {  
                     *chPtr = byte;  
                     return 1;  
                 }  
                 ch <<= 6;  
                 ch |= (*str & 0x3F);  
                 trail--;  
             } while (trail > 0);  
             *chPtr = ch;  
             return total;  
         }  
     }  
 #endif  
   
     *chPtr = (Tcl_UniChar) byte;  
     return 1;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfToUniCharDString --  
  *  
  *      Convert the UTF-8 string to Unicode.  
  *  
  * Results:  
  *      The return value is a pointer to the Unicode representation of the  
  *      UTF-8 string.  Storage for the return value is appended to the  
  *      end of dsPtr.  The Unicode string is terminated with a Unicode  
  *      NULL character.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 Tcl_UniChar *  
 Tcl_UtfToUniCharDString(string, length, dsPtr)  
     CONST char *string;         /* UTF-8 string to convert to Unicode. */  
     int length;                 /* Length of UTF-8 string in bytes, or -1  
                                  * for strlen(). */  
     Tcl_DString *dsPtr;         /* Unicode representation of string is  
                                  * appended to this previously initialized  
                                  * DString. */  
 {  
     Tcl_UniChar *w, *wString;  
     CONST char *p, *end;  
     int oldLength;  
   
     if (length < 0) {  
         length = strlen(string);  
     }  
   
     /*  
      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length  
      * in bytes.  
      */  
   
     oldLength = Tcl_DStringLength(dsPtr);  
     Tcl_DStringSetLength(dsPtr,  
             (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));  
     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);  
   
     w = wString;  
     end = string + length;  
     for (p = string; p < end; ) {  
         p += Tcl_UtfToUniChar(p, w);  
         w++;  
     }  
     *w = '\0';  
     Tcl_DStringSetLength(dsPtr,  
             (oldLength + ((char *) w - (char *) wString)));  
   
     return wString;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfCharComplete --  
  *  
  *      Determine if the UTF-8 string of the given length is long enough  
  *      to be decoded by Tcl_UtfToUniChar().  This does not ensure that the  
  *      UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().  
  *  
  * Results:  
  *      The return value is 0 if the string is not long enough, non-zero  
  *      otherwise.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfCharComplete(str, len)  
     CONST char *str;            /* String to check if first few bytes  
                                  * contain a complete UTF-8 character. */  
     int len;                    /* Length of above string in bytes. */  
 {  
     int ch;  
   
     ch = *((unsigned char *) str);  
     return len >= totalBytes[ch];  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_NumUtfChars --  
  *  
  *      Returns the number of characters (not bytes) in the UTF-8 string,  
  *      not including the terminating NULL byte.  This is equivalent to  
  *      Plan 9 utflen() and utfnlen().  
  *  
  * Results:  
  *      As above.    
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 int  
 Tcl_NumUtfChars(str, len)  
     register CONST char *str;   /* The UTF-8 string to measure. */  
     int len;                    /* The length of the string in bytes, or -1  
                                  * for strlen(string). */  
 {  
     Tcl_UniChar ch;  
     register Tcl_UniChar *chPtr = &ch;  
     register int n;  
     int i;  
   
     /*  
      * The separate implementations are faster.  
      */  
       
     i = 0;  
     if (len < 0) {  
         while (1) {  
             str += Tcl_UtfToUniChar(str, chPtr);  
             if (ch == '\0') {  
                 break;  
             }  
             i++;  
         }  
     } else {  
         while (len > 0) {  
             n = Tcl_UtfToUniChar(str, chPtr);  
             len -= n;  
             str += n;  
             i++;  
         }  
     }  
     return i;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfFindFirst --  
  *  
  *      Returns a pointer to the first occurance of the given Tcl_UniChar  
  *      in the NULL-terminated UTF-8 string.  The NULL terminator is  
  *      considered part of the UTF-8 string.  Equivalent to Plan 9  
  *      utfrune().  
  *  
  * Results:  
  *      As above.  If the Tcl_UniChar does not exist in the given string,  
  *      the return value is NULL.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
 char *  
 Tcl_UtfFindFirst(string, ch)  
     CONST char *string;         /* The UTF-8 string to be searched. */  
     int ch;                     /* The Tcl_UniChar to search for. */  
 {  
     int len;  
     Tcl_UniChar find;  
       
     while (1) {  
         len = Tcl_UtfToUniChar(string, &find);  
         if (find == ch) {  
             return (char *) string;  
         }  
         if (*string == '\0') {  
             return NULL;  
         }  
         string += len;  
     }  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfFindLast --  
  *  
  *      Returns a pointer to the last occurance of the given Tcl_UniChar  
  *      in the NULL-terminated UTF-8 string.  The NULL terminator is  
  *      considered part of the UTF-8 string.  Equivalent to Plan 9  
  *      utfrrune().  
  *  
  * Results:  
  *      As above.  If the Tcl_UniChar does not exist in the given string,  
  *      the return value is NULL.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 char *  
 Tcl_UtfFindLast(string, ch)  
     CONST char *string;         /* The UTF-8 string to be searched. */  
     int ch;                     /* The Tcl_UniChar to search for. */  
 {  
     int len;  
     Tcl_UniChar find;  
     CONST char *last;  
           
     last = NULL;  
     while (1) {  
         len = Tcl_UtfToUniChar(string, &find);  
         if (find == ch) {  
             last = string;  
         }  
         if (*string == '\0') {  
             break;  
         }  
         string += len;  
     }  
     return (char *) last;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfNext --  
  *  
  *      Given a pointer to some current location in a UTF-8 string,  
  *      move forward one character.  The caller must ensure that they  
  *      are not asking for the next character after the last character  
  *      in the string.  
  *  
  * Results:  
  *      The return value is the pointer to the next character in  
  *      the UTF-8 string.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 char *  
 Tcl_UtfNext(str)  
     CONST char *str;                /* The current location in the string. */  
 {  
     Tcl_UniChar ch;  
   
     return (char *) str + Tcl_UtfToUniChar(str, &ch);  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfPrev --  
  *  
  *      Given a pointer to some current location in a UTF-8 string,  
  *      move backwards one character.  
  *  
  * Results:  
  *      The return value is a pointer to the previous character in the  
  *      UTF-8 string.  If the current location was already at the  
  *      beginning of the string, the return value will also be a  
  *      pointer to the beginning of the string.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 char *  
 Tcl_UtfPrev(str, start)  
     CONST char *str;                /* The current location in the string. */  
     CONST char *start;              /* Pointer to the beginning of the  
                                      * string, to avoid going backwards too  
                                      * far. */  
 {  
     CONST char *look;  
     int i, byte;  
       
     str--;  
     look = str;  
     for (i = 0; i < TCL_UTF_MAX; i++) {  
         if (look < start) {  
             if (str < start) {  
                 str = start;  
             }  
             break;  
         }  
         byte = *((unsigned char *) look);  
         if (byte < 0x80) {  
             break;  
         }  
         if (byte >= 0xC0) {  
             if (totalBytes[byte] != i + 1) {  
                 break;  
             }  
             return (char *) look;  
         }  
         look--;  
     }  
     return (char *) str;  
 }  
         
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UniCharAtIndex --  
  *  
  *      Returns the Unicode character represented at the specified  
  *      character (not byte) position in the UTF-8 string.  
  *  
  * Results:  
  *      As above.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 Tcl_UniChar  
 Tcl_UniCharAtIndex(src, index)  
     register CONST char *src;   /* The UTF-8 string to dereference. */  
     register int index;         /* The position of the desired character. */  
 {  
     Tcl_UniChar ch;  
   
     while (index >= 0) {  
         index--;  
         src += Tcl_UtfToUniChar(src, &ch);  
     }  
     return ch;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfAtIndex --  
  *  
  *      Returns a pointer to the specified character (not byte) position  
  *      in the UTF-8 string.  
  *  
  * Results:  
  *      As above.  
  *  
  * Side effects:  
  *      None.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 char *  
 Tcl_UtfAtIndex(src, index)  
     register CONST char *src;   /* The UTF-8 string. */  
     register int index;         /* The position of the desired character. */  
 {  
     Tcl_UniChar ch;  
       
     while (index > 0) {  
         index--;  
         src += Tcl_UtfToUniChar(src, &ch);  
     }  
     return (char *) src;  
 }  
   
 /*  
  *---------------------------------------------------------------------------  
  *  
  * Tcl_UtfBackslash --  
  *  
  *      Figure out how to handle a backslash sequence.  
  *  
  * Results:  
  *      Stores the bytes represented by the backslash sequence in dst and  
  *      returns the number of bytes written to dst.  At most TCL_UTF_MAX  
  *      bytes are written to dst; dst must have been large enough to accept  
  *      those bytes.  If readPtr isn't NULL then it is filled in with a  
  *      count of the number of bytes in the backslash sequence.    
  *  
  * Side effects:  
  *      The maximum number of bytes it takes to represent a Unicode  
  *      character in UTF-8 is guaranteed to be less than the number of  
  *      bytes used to express the backslash sequence that represents  
  *      that Unicode character.  If the target buffer into which the  
  *      caller is going to store the bytes that represent the Unicode  
  *      character is at least as large as the source buffer from which  
  *      the backslashed sequence was extracted, no buffer overruns should  
  *      occur.  
  *  
  *---------------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfBackslash(src, readPtr, dst)  
     CONST char *src;            /* Points to the backslash character of  
                                  * a backslash sequence. */  
     int *readPtr;               /* Fill in with number of characters read  
                                  * from src, unless NULL. */  
     char *dst;                  /* Filled with the bytes represented by the  
                                  * backslash sequence. */  
 {  
     register CONST char *p = src+1;  
     int result, count, n;  
     char buf[TCL_UTF_MAX];  
   
     if (dst == NULL) {  
         dst = buf;  
     }  
   
     count = 2;  
     switch (*p) {  
         /*  
          * Note: in the conversions below, use absolute values (e.g.,  
          * 0xa) rather than symbolic values (e.g. \n) that get converted  
          * by the compiler.  It's possible that compilers on some  
          * platforms will do the symbolic conversions differently, which  
          * could result in non-portable Tcl scripts.  
          */  
   
         case 'a':  
             result = 0x7;  
             break;  
         case 'b':  
             result = 0x8;  
             break;  
         case 'f':  
             result = 0xc;  
             break;  
         case 'n':  
             result = 0xa;  
             break;  
         case 'r':  
             result = 0xd;  
             break;  
         case 't':  
             result = 0x9;  
             break;  
         case 'v':  
             result = 0xb;  
             break;  
         case 'x':  
             if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */  
                 char *end;  
   
                 result = (unsigned char) strtoul(p+1, &end, 16);  
                 count = end - src;  
             } else {  
                 count = 2;  
                 result = 'x';  
             }  
             break;  
         case 'u':  
             result = 0;  
             for (count = 0; count < 4; count++) {  
                 p++;  
                 if (!isxdigit(UCHAR(*p))) { /* INTL: digit */  
                     break;  
                 }  
                 n = *p - '0';  
                 if (n > 9) {  
                     n = n + '0' + 10 - 'A';  
                 }  
                 if (n > 16) {  
                     n = n + 'A' - 'a';  
                 }  
                 result = (result << 4) + n;  
             }  
             if (count == 0) {  
                 result = 'u';  
             }  
             count += 2;  
             break;  
                       
         case '\n':  
             do {  
                 p++;  
             } while ((*p == ' ') || (*p == '\t'));  
             result = ' ';  
             count = p - src;  
             break;  
         case 0:  
             result = '\\';  
             count = 1;  
             break;  
         default:  
             /*  
              * Check for an octal number \oo?o?  
              */  
             if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */  
                 result = (unsigned char)(*p - '0');  
                 p++;  
                 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */  
                     break;  
                 }  
                 count = 3;  
                 result = (unsigned char)((result << 3) + (*p - '0'));  
                 p++;  
                 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */  
                     break;  
                 }  
                 count = 4;  
                 result = (unsigned char)((result << 3) + (*p - '0'));  
                 break;  
             }  
             result = *p;  
             count = 2;  
             break;  
     }  
   
     if (readPtr != NULL) {  
         *readPtr = count;  
     }  
     return Tcl_UniCharToUtf(result, dst);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UtfToUpper --  
  *  
  *      Convert lowercase characters to uppercase characters in a UTF  
  *      string in place.  The conversion may shrink the UTF string.  
  *  
  * Results:  
  *      Returns the number of bytes in the resulting string  
  *      excluding the trailing null.  
  *  
  * Side effects:  
  *      Writes a terminating null after the last converted character.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfToUpper(str)  
     char *str;                  /* String to convert in place. */  
 {  
     Tcl_UniChar ch, upChar;  
     char *src, *dst;  
     int bytes;  
   
     /*  
      * Iterate over the string until we hit the terminating null.  
      */  
   
     src = dst = str;  
     while (*src) {  
         bytes = Tcl_UtfToUniChar(src, &ch);  
         upChar = Tcl_UniCharToUpper(ch);  
   
         /*  
          * To keep badly formed Utf strings from getting inflated by  
          * the conversion (thereby causing a segfault), only copy the  
          * upper case char to dst if its size is <= the original char.  
          */  
           
         if (bytes < UtfCount(upChar)) {  
             memcpy(dst, src, (size_t) bytes);  
             dst += bytes;  
         } else {  
             dst += Tcl_UniCharToUtf(upChar, dst);  
         }  
         src += bytes;  
     }  
     *dst = '\0';  
     return (dst - str);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UtfToLower --  
  *  
  *      Convert uppercase characters to lowercase characters in a UTF  
  *      string in place.  The conversion may shrink the UTF string.  
  *  
  * Results:  
  *      Returns the number of bytes in the resulting string  
  *      excluding the trailing null.  
  *  
  * Side effects:  
  *      Writes a terminating null after the last converted character.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfToLower(str)  
     char *str;                  /* String to convert in place. */  
 {  
     Tcl_UniChar ch, lowChar;  
     char *src, *dst;  
     int bytes;  
       
     /*  
      * Iterate over the string until we hit the terminating null.  
      */  
   
     src = dst = str;  
     while (*src) {  
         bytes = Tcl_UtfToUniChar(src, &ch);  
         lowChar = Tcl_UniCharToLower(ch);  
   
         /*  
          * To keep badly formed Utf strings from getting inflated by  
          * the conversion (thereby causing a segfault), only copy the  
          * lower case char to dst if its size is <= the original char.  
          */  
           
         if (bytes < UtfCount(lowChar)) {  
             memcpy(dst, src, (size_t) bytes);  
             dst += bytes;  
         } else {  
             dst += Tcl_UniCharToUtf(lowChar, dst);  
         }  
         src += bytes;  
     }  
     *dst = '\0';  
     return (dst - str);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UtfToTitle --  
  *  
  *      Changes the first character of a UTF string to title case or  
  *      uppercase and the rest of the string to lowercase.  The  
  *      conversion happens in place and may shrink the UTF string.  
  *  
  * Results:  
  *      Returns the number of bytes in the resulting string  
  *      excluding the trailing null.  
  *  
  * Side effects:  
  *      Writes a terminating null after the last converted character.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfToTitle(str)  
     char *str;                  /* String to convert in place. */  
 {  
     Tcl_UniChar ch, titleChar, lowChar;  
     char *src, *dst;  
     int bytes;  
       
     /*  
      * Capitalize the first character and then lowercase the rest of the  
      * characters until we get to a null.  
      */  
   
     src = dst = str;  
   
     if (*src) {  
         bytes = Tcl_UtfToUniChar(src, &ch);  
         titleChar = Tcl_UniCharToTitle(ch);  
   
         if (bytes < UtfCount(titleChar)) {  
             memcpy(dst, src, (size_t) bytes);  
             dst += bytes;  
         } else {  
             dst += Tcl_UniCharToUtf(titleChar, dst);  
         }  
         src += bytes;  
     }  
     while (*src) {  
         bytes = Tcl_UtfToUniChar(src, &ch);  
         lowChar = Tcl_UniCharToLower(ch);  
   
         if (bytes < UtfCount(lowChar)) {  
             memcpy(dst, src, (size_t) bytes);  
             dst += bytes;  
         } else {  
             dst += Tcl_UniCharToUtf(lowChar, dst);  
         }  
         src += bytes;  
     }  
     *dst = '\0';  
     return (dst - str);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UtfNcmp --  
  *  
  *      Compare at most n UTF chars of string cs to string ct.  Both cs  
  *      and ct are assumed to be at least n UTF chars long.  
  *  
  * Results:  
  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfNcmp(cs, ct, n)  
     CONST char *cs;             /* UTF string to compare to ct. */  
     CONST char *ct;             /* UTF string cs is compared to. */  
     unsigned long n;            /* Number of UTF chars to compare. */  
 {  
     Tcl_UniChar ch1, ch2;  
     /*  
      * Another approach that should work is:  
      *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));  
      * That assumes that ct is a properly formed UTF, so we will just  
      * be comparing the bytes that compromise those strings to the  
      * char length n.  
      */  
     while (n-- > 0) {  
         /*  
          * n must be interpreted as chars, not bytes.  
          * This should be called only when both strings are of  
          * at least n chars long (no need for \0 check)  
          */  
         cs += Tcl_UtfToUniChar(cs, &ch1);  
         ct += Tcl_UtfToUniChar(ct, &ch2);  
         if (ch1 != ch2) {  
             return (ch1 - ch2);  
         }  
     }  
     return 0;  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UtfNcasecmp --  
  *  
  *      Compare at most n UTF chars of string cs to string ct case  
  *      insensitive.  Both cs and ct are assumed to be at least n  
  *      UTF chars long.  
  *  
  * Results:  
  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UtfNcasecmp(cs, ct, n)  
     CONST char *cs;             /* UTF string to compare to ct. */  
     CONST char *ct;             /* UTF string cs is compared to. */  
     unsigned long n;                    /* Number of UTF chars to compare. */  
 {  
     Tcl_UniChar ch1, ch2;  
     while (n-- > 0) {  
         /*  
          * n must be interpreted as chars, not bytes.  
          * This should be called only when both strings are of  
          * at least n chars long (no need for \0 check)  
          */  
         cs += Tcl_UtfToUniChar(cs, &ch1);  
         ct += Tcl_UtfToUniChar(ct, &ch2);  
         if (ch1 != ch2) {  
             ch1 = Tcl_UniCharToLower(ch1);  
             ch2 = Tcl_UniCharToLower(ch2);  
             if (ch1 != ch2) {  
                 return (ch1 - ch2);  
             }  
         }  
     }  
     return 0;  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharToUpper --  
  *  
  *      Compute the uppercase equivalent of the given Unicode character.  
  *  
  * Results:  
  *      Returns the uppercase Unicode character.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 Tcl_UniChar  
 Tcl_UniCharToUpper(ch)  
     int ch;                     /* Unicode character to convert. */  
 {  
     int info = GetUniCharInfo(ch);  
   
     if (GetCaseType(info) & 0x04) {  
         return (Tcl_UniChar) (ch - GetDelta(info));  
     } else {  
         return ch;  
     }  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharToLower --  
  *  
  *      Compute the lowercase equivalent of the given Unicode character.  
  *  
  * Results:  
  *      Returns the lowercase Unicode character.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 Tcl_UniChar  
 Tcl_UniCharToLower(ch)  
     int ch;                     /* Unicode character to convert. */  
 {  
     int info = GetUniCharInfo(ch);  
   
     if (GetCaseType(info) & 0x02) {  
         return (Tcl_UniChar) (ch + GetDelta(info));  
     } else {  
         return ch;  
     }  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharToTitle --  
  *  
  *      Compute the titlecase equivalent of the given Unicode character.  
  *  
  * Results:  
  *      Returns the titlecase Unicode character.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 Tcl_UniChar  
 Tcl_UniCharToTitle(ch)  
     int ch;                     /* Unicode character to convert. */  
 {  
     int info = GetUniCharInfo(ch);  
     int mode = GetCaseType(info);  
   
     if (mode & 0x1) {  
         /*  
          * Subtract or add one depending on the original case.  
          */  
   
         return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));  
     } else if (mode == 0x4) {  
         return (Tcl_UniChar) (ch - GetDelta(info));  
     } else {  
         return ch;  
     }  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharLen --  
  *  
  *      Find the length of a UniChar string.  The str input must be null  
  *      terminated.  
  *  
  * Results:  
  *      Returns the length of str in UniChars (not bytes).  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharLen(str)  
     Tcl_UniChar *str;           /* Unicode string to find length of. */  
 {  
     int len = 0;  
       
     while (*str != '\0') {  
         len++;  
         str++;  
     }  
     return len;  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharNcmp --  
  *  
  *      Compare at most n unichars of string cs to string ct.  Both cs  
  *      and ct are assumed to be at least n unichars long.  
  *  
  * Results:  
  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharNcmp(cs, ct, n)  
     CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */  
     CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */  
     unsigned long n;                    /* Number of unichars to compare. */  
 {  
     for ( ; n != 0; n--, cs++, ct++) {  
         if (*cs != *ct) {  
             return *cs - *ct;  
         }  
         if (*cs == '\0') {  
             break;  
         }  
     }  
     return 0;  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsAlnum --  
  *  
  *      Test if a character is an alphanumeric Unicode character.  
  *  
  * Results:  
  *      Returns 1 if character is alphanumeric.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsAlnum(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
   
     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsAlpha --  
  *  
  *      Test if a character is an alphabetic Unicode character.  
  *  
  * Results:  
  *      Returns 1 if character is alphabetic.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsAlpha(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
     return ((ALPHA_BITS >> category) & 1);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsControl --  
  *  
  *      Test if a character is a Unicode control character.  
  *  
  * Results:  
  *      Returns non-zero if character is a control.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsControl(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsDigit --  
  *  
  *      Test if a character is a numeric Unicode character.  
  *  
  * Results:  
  *      Returns non-zero if character is a digit.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsDigit(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)  
             == DECIMAL_DIGIT_NUMBER);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsGraph --  
  *  
  *      Test if a character is any Unicode print character except space.  
  *  
  * Results:  
  *      Returns non-zero if character is printable, but not space.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsGraph(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsLower --  
  *  
  *      Test if a character is a lowercase Unicode character.  
  *  
  * Results:  
  *      Returns non-zero if character is lowercase.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsLower(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsPrint --  
  *  
  *      Test if a character is a Unicode print character.  
  *  
  * Results:  
  *      Returns non-zero if character is printable.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsPrint(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
     return ((PRINT_BITS >> category) & 1);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsPunct --  
  *  
  *      Test if a character is a Unicode punctuation character.  
  *  
  * Results:  
  *      Returns non-zero if character is punct.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsPunct(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
     return ((PUNCT_BITS >> category) & 1);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsSpace --  
  *  
  *      Test if a character is a whitespace Unicode character.  
  *  
  * Results:  
  *      Returns non-zero if character is a space.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsSpace(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category;  
   
     /*  
      * If the character is within the first 127 characters, just use the  
      * standard C function, otherwise consult the Unicode table.  
      */  
   
     if (ch < 0x80) {  
         return isspace(UCHAR(ch)); /* INTL: ISO space */  
     } else {  
         category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
         return ((SPACE_BITS >> category) & 1);  
     }  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsUpper --  
  *  
  *      Test if a character is an uppercase Unicode character.
  *  
  * Results:  
  *      Returns non-zero if character is uppercase.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsUpper(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);  
 }  
   
 /*  
  *----------------------------------------------------------------------  
  *  
  * Tcl_UniCharIsWordChar --  
  *  
  *      Test if a character is alphanumeric or a connector punctuation  
  *      mark.  
  *  
  * Results:  
  *      Returns 1 if character is a word character.  
  *  
  * Side effects:  
  *      None.  
  *  
  *----------------------------------------------------------------------  
  */  
   
 int  
 Tcl_UniCharIsWordChar(ch)  
     int ch;                     /* Unicode character to test. */  
 {  
     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);  
   
     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);  
 }  
   
   
 /* $History: tclutf.c $  
  *  
  * *****************  Version 1  *****************  
  * User: Dtashley     Date: 1/02/01    Time: 1:05a  
  * Created in $/IjuScripter, IjuConsole/Source/Tcl Base  
  * Initial check-in.  
  */  
   
 /* End of TCL_UTF.C */  
1    /* $Header$ */
2    /*
3     * tclUtf.c --
4     *
5     *      Routines for manipulating UTF-8 strings.
6     *
7     * Copyright (c) 1997-1998 Sun Microsystems, Inc.
8     *
9     * See the file "license.terms" for information on usage and redistribution
10     * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11     *
12     * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
13     */
14    
15    #include "tclInt.h"
16    
17    /*
18     * Include the static character classification tables and macros.
19     */
20    
21    #include "tclUniData.c"
22    
23    /*
24     * The following macros are used for fast character category tests.  The
25     * x_BITS values are shifted right by the category value to determine whether
26     * the given category is included in the set.
27     */
28    
/* Letter categories. */
#define ALPHA_BITS ( \
        (1 << UPPERCASE_LETTER) | \
        (1 << LOWERCASE_LETTER) | \
        (1 << TITLECASE_LETTER) | \
        (1 << MODIFIER_LETTER) | \
        (1 << OTHER_LETTER))

/* Decimal digits only. */
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* Space, line and paragraph separators. */
#define SPACE_BITS ( \
        (1 << SPACE_SEPARATOR) | \
        (1 << LINE_SEPARATOR) | \
        (1 << PARAGRAPH_SEPARATOR))

/* Connector punctuation on its own. */
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/* Every punctuation category. */
#define PUNCT_BITS ( \
        CONNECTOR_BITS | \
        (1 << DASH_PUNCTUATION) | \
        (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | \
        (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | \
        (1 << OTHER_PUNCTUATION))

/*
 * Letters, digits, separators and punctuation, plus the mark, other-number
 * and symbol categories.
 */
#define PRINT_BITS ( \
        ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
        (1 << NON_SPACING_MARK) | \
        (1 << ENCLOSING_MARK) | \
        (1 << COMBINING_SPACING_MARK) | \
        (1 << LETTER_NUMBER) | \
        (1 << OTHER_NUMBER) | \
        (1 << MATH_SYMBOL) | \
        (1 << CURRENCY_SYMBOL) | \
        (1 << MODIFIER_SYMBOL) | \
        (1 << OTHER_SYMBOL))

/*
 * Unicode characters below this value are stored as a single byte equal
 * to the character itself in UTF-8 strings.
 */

#define UNICODE_SELF    0x80
60    
61    /*
62     * The following structures are used when mapping between Unicode (UCS-2)
63     * and UTF-8.
64     */
65    
66    CONST unsigned char totalBytes[256] = {
67        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
74        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
75    #if TCL_UTF_MAX > 3
76        4,4,4,4,4,4,4,4,
77    #else
78        1,1,1,1,1,1,1,1,
79    #endif
80    #if TCL_UTF_MAX > 4
81        5,5,5,5,
82    #else
83        1,1,1,1,
84    #endif
85    #if TCL_UTF_MAX > 5
86        6,6,6,6
87    #else
88        1,1,1,1
89    #endif
90    };
91    
92    /*
93     * Procedures used only in this module.
94     */
95    
96    static int UtfCount _ANSI_ARGS_((int ch));
97    
98    
99    /*
100     *---------------------------------------------------------------------------
101     *
102     * UtfCount --
103     *
104     *      Find the number of bytes in the Utf character "ch".
105     *
106     * Results:
107     *      The return values is the number of bytes in the Utf character "ch".
108     *
109     * Side effects:
110     *      None.
111     *
112     *---------------------------------------------------------------------------
113     */
114    
115    static int
116    UtfCount(ch)
117        int ch;                     /* The Tcl_UniChar whose size is returned. */
118    {
119        if ((ch > 0) && (ch < UNICODE_SELF)) {
120            return 1;
121        }
122        if (ch <= 0x7FF) {
123            return 2;
124        }
125        if (ch <= 0xFFFF) {
126            return 3;
127        }
128    #if TCL_UTF_MAX > 3
129        if (ch <= 0x1FFFFF) {
130            return 4;
131        }
132        if (ch <= 0x3FFFFFF) {
133            return 5;
134        }
135        if (ch <= 0x7FFFFFFF) {
136            return 6;
137        }
138    #endif
139        return 3;
140    }
141    
142    /*
143     *---------------------------------------------------------------------------
144     *
145     * Tcl_UniCharToUtf --
146     *
147     *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
148     *      provided buffer.  Equivalent to Plan 9 runetochar().
149     *
150     * Results:
151     *      The return value is the number of bytes in the buffer that
152     *      were consumed.  
153     *
154     * Side effects:
155     *      None.
156     *
157     *---------------------------------------------------------------------------
158     */
159    
160    INLINE int
161    Tcl_UniCharToUtf(ch, str)
162        int ch;                     /* The Tcl_UniChar to be stored in the
163                                     * buffer. */
164        char *str;                  /* Buffer in which the UTF-8 representation
165                                     * of the Tcl_UniChar is stored.  Buffer must
166                                     * be large enough to hold the UTF-8 character
167                                     * (at most TCL_UTF_MAX bytes). */
168    {
169        if ((ch > 0) && (ch < UNICODE_SELF)) {
170            str[0] = (char) ch;
171            return 1;
172        }
173        if (ch <= 0x7FF) {
174            str[1] = (char) ((ch | 0x80) & 0xBF);
175            str[0] = (char) ((ch >> 6) | 0xC0);
176            return 2;
177        }
178        if (ch <= 0xFFFF) {
179            three:
180            str[2] = (char) ((ch | 0x80) & 0xBF);
181            str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
182            str[0] = (char) ((ch >> 12) | 0xE0);
183            return 3;
184        }
185    
186    #if TCL_UTF_MAX > 3
187        if (ch <= 0x1FFFFF) {
188            str[3] = (char) ((ch | 0x80) & 0xBF);
189            str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
190            str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
191            str[0] = (char) ((ch >> 18) | 0xF0);
192            return 4;
193        }
194        if (ch <= 0x3FFFFFF) {
195            str[4] = (char) ((ch | 0x80) & 0xBF);
196            str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
197            str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
198            str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
199            str[0] = (char) ((ch >> 24) | 0xF8);
200            return 5;
201        }
202        if (ch <= 0x7FFFFFFF) {
203            str[5] = (char) ((ch | 0x80) & 0xBF);
204            str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
205            str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
206            str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
207            str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
208            str[0] = (char) ((ch >> 30) | 0xFC);
209            return 6;
210        }
211    #endif
212    
213        ch = 0xFFFD;
214        goto three;
215    }
216    
217    /*
218     *---------------------------------------------------------------------------
219     *
220     * Tcl_UniCharToUtfDString --
221     *
222     *      Convert the given Unicode string to UTF-8.
223     *
224     * Results:
225     *      The return value is a pointer to the UTF-8 representation of the
226     *      Unicode string.  Storage for the return value is appended to the
227     *      end of dsPtr.
228     *
229     * Side effects:
230     *      None.
231     *
232     *---------------------------------------------------------------------------
233     */
234    
235    char *
236    Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
237        CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
238        int numChars;               /* Length of Unicode string in Tcl_UniChars
239                                     * (must be >= 0). */
240        Tcl_DString *dsPtr;         /* UTF-8 representation of string is
241                                     * appended to this previously initialized
242                                     * DString. */
243    {
244        CONST Tcl_UniChar *w, *wEnd;
245        char *p, *string;
246        int oldLength;
247    
248        /*
249         * UTF-8 string length in bytes will be <= Unicode string length *
250         * TCL_UTF_MAX.
251         */
252    
253        oldLength = Tcl_DStringLength(dsPtr);
254        Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
255        string = Tcl_DStringValue(dsPtr) + oldLength;
256    
257        p = string;
258        wEnd = wString + numChars;
259        for (w = wString; w < wEnd; ) {
260            p += Tcl_UniCharToUtf(*w, p);
261            w++;
262        }
263        Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
264    
265        return string;
266    }
267    
268    /*
269     *---------------------------------------------------------------------------
270     *
271     * Tcl_UtfToUniChar --
272     *
273     *      Extract the Tcl_UniChar represented by the UTF-8 string.  Bad
274     *      UTF-8 sequences are converted to valid Tcl_UniChars and processing
275     *      continues.  Equivalent to Plan 9 chartorune().
276     *
277     *      The caller must ensure that the source buffer is long enough that
278     *      this routine does not run off the end and dereference non-existent
279     *      memory looking for trail bytes.  If the source buffer is known to
280     *      be '\0' terminated, this cannot happen.  Otherwise, the caller
281     *      should call Tcl_UtfCharComplete() before calling this routine to
282     *      ensure that enough bytes remain in the string.
283     *
284     * Results:
285     *      *chPtr is filled with the Tcl_UniChar, and the return value is the
286     *      number of bytes from the UTF-8 string that were consumed.
287     *
288     * Side effects:
289     *      None.
290     *
291     *---------------------------------------------------------------------------
292     */
293    
294    int
295    Tcl_UtfToUniChar(str, chPtr)
296        register CONST char *str;    /* The UTF-8 string. */
297        register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
298                                      * by the UTF-8 string. */
299    {
300        register int byte;
301        
302        /*
303         * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
304         */
305    
306        byte = *((unsigned char *) str);
307        if (byte < 0xC0) {
308            /*
309             * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
310             * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
311             * characters representing themselves.
312             */
313            
314            *chPtr = (Tcl_UniChar) byte;
315            return 1;
316        } else if (byte < 0xE0) {
317            if ((str[1] & 0xC0) == 0x80) {
318                /*
319                 * Two-byte-character lead-byte followed by a trail-byte.
320                 */
321                
322                *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
323                return 2;
324            }
325            /*
326             * A two-byte-character lead-byte not followed by trail-byte
327             * represents itself.
328             */
329            
330            *chPtr = (Tcl_UniChar) byte;
331            return 1;
332        } else if (byte < 0xF0) {
333            if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
334                /*
335                 * Three-byte-character lead byte followed by two trail bytes.
336                 */
337    
338                *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339                        | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
340                return 3;
341            }
342            /*
343             * A three-byte-character lead-byte not followed by two trail-bytes
344             * represents itself.
345             */
346    
347            *chPtr = (Tcl_UniChar) byte;
348            return 1;
349        }
350    #if TCL_UTF_MAX > 3
351        else {
352            int ch, total, trail;
353    
354            total = totalBytes[byte];
355            trail = total - 1;
356            if (trail > 0) {
357                ch = byte & (0x3F >> trail);
358                do {
359                    str++;
360                    if ((*str & 0xC0) != 0x80) {
361                        *chPtr = byte;
362                        return 1;
363                    }
364                    ch <<= 6;
365                    ch |= (*str & 0x3F);
366                    trail--;
367                } while (trail > 0);
368                *chPtr = ch;
369                return total;
370            }
371        }
372    #endif
373    
374        *chPtr = (Tcl_UniChar) byte;
375        return 1;
376    }
377    
378    /*
379     *---------------------------------------------------------------------------
380     *
381     * Tcl_UtfToUniCharDString --
382     *
383     *      Convert the UTF-8 string to Unicode.
384     *
385     * Results:
386     *      The return value is a pointer to the Unicode representation of the
387     *      UTF-8 string.  Storage for the return value is appended to the
388     *      end of dsPtr.  The Unicode string is terminated with a Unicode
389     *      NULL character.
390     *
391     * Side effects:
392     *      None.
393     *
394     *---------------------------------------------------------------------------
395     */
396    
397    Tcl_UniChar *
398    Tcl_UtfToUniCharDString(string, length, dsPtr)
399        CONST char *string;         /* UTF-8 string to convert to Unicode. */
400        int length;                 /* Length of UTF-8 string in bytes, or -1
401                                     * for strlen(). */
402        Tcl_DString *dsPtr;         /* Unicode representation of string is
403                                     * appended to this previously initialized
404                                     * DString. */
405    {
406        Tcl_UniChar *w, *wString;
407        CONST char *p, *end;
408        int oldLength;
409    
410        if (length < 0) {
411            length = strlen(string);
412        }
413    
414        /*
415         * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
416         * in bytes.
417         */
418    
419        oldLength = Tcl_DStringLength(dsPtr);
420        Tcl_DStringSetLength(dsPtr,
421                (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422        wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423    
424        w = wString;
425        end = string + length;
426        for (p = string; p < end; ) {
427            p += Tcl_UtfToUniChar(p, w);
428            w++;
429        }
430        *w = '\0';
431        Tcl_DStringSetLength(dsPtr,
432                (oldLength + ((char *) w - (char *) wString)));
433    
434        return wString;
435    }
436    
437    /*
438     *---------------------------------------------------------------------------
439     *
440     * Tcl_UtfCharComplete --
441     *
442     *      Determine if the UTF-8 string of the given length is long enough
443     *      to be decoded by Tcl_UtfToUniChar().  This does not ensure that the
444     *      UTF-8 string is properly formed.  Equivalent to Plan 9 fullrune().
445     *
446     * Results:
447     *      The return value is 0 if the string is not long enough, non-zero
448     *      otherwise.
449     *
450     * Side effects:
451     *      None.
452     *
453     *---------------------------------------------------------------------------
454     */
455    
456    int
457    Tcl_UtfCharComplete(str, len)
458        CONST char *str;            /* String to check if first few bytes
459                                     * contain a complete UTF-8 character. */
460        int len;                    /* Length of above string in bytes. */
461    {
462        int ch;
463    
464        ch = *((unsigned char *) str);
465        return len >= totalBytes[ch];
466    }
467    
468    /*
469     *---------------------------------------------------------------------------
470     *
471     * Tcl_NumUtfChars --
472     *
473     *      Returns the number of characters (not bytes) in the UTF-8 string,
474     *      not including the terminating NULL byte.  This is equivalent to
475     *      Plan 9 utflen() and utfnlen().
476     *
477     * Results:
478     *      As above.  
479     *
480     * Side effects:
481     *      None.
482     *
483     *---------------------------------------------------------------------------
484     */
485    
486    int
487    Tcl_NumUtfChars(str, len)
488        register CONST char *str;   /* The UTF-8 string to measure. */
489        int len;                    /* The length of the string in bytes, or -1
490                                     * for strlen(string). */
491    {
492        Tcl_UniChar ch;
493        register Tcl_UniChar *chPtr = &ch;
494        register int n;
495        int i;
496    
497        /*
498         * The separate implementations are faster.
499         */
500        
501        i = 0;
502        if (len < 0) {
503            while (1) {
504                str += Tcl_UtfToUniChar(str, chPtr);
505                if (ch == '\0') {
506                    break;
507                }
508                i++;
509            }
510        } else {
511            while (len > 0) {
512                n = Tcl_UtfToUniChar(str, chPtr);
513                len -= n;
514                str += n;
515                i++;
516            }
517        }
518        return i;
519    }
520    
521    /*
522     *---------------------------------------------------------------------------
523     *
524     * Tcl_UtfFindFirst --
525     *
526     *      Returns a pointer to the first occurrence of the given Tcl_UniChar
527     *      in the NULL-terminated UTF-8 string.  The NULL terminator is
528     *      considered part of the UTF-8 string.  Equivalent to Plan 9
529     *      utfrune().
530     *
531     * Results:
532     *      As above.  If the Tcl_UniChar does not exist in the given string,
533     *      the return value is NULL.
534     *
535     * Side effects:
536     *      None.
537     *
538     *---------------------------------------------------------------------------
539     */
540    char *
541    Tcl_UtfFindFirst(string, ch)
542        CONST char *string;         /* The UTF-8 string to be searched. */
543        int ch;                     /* The Tcl_UniChar to search for. */
544    {
545        int len;
546        Tcl_UniChar find;
547        
548        while (1) {
549            len = Tcl_UtfToUniChar(string, &find);
550            if (find == ch) {
551                return (char *) string;
552            }
553            if (*string == '\0') {
554                return NULL;
555            }
556            string += len;
557        }
558    }
559    
560    /*
561     *---------------------------------------------------------------------------
562     *
563     * Tcl_UtfFindLast --
564     *
565     *      Returns a pointer to the last occurrence of the given Tcl_UniChar
566     *      in the NULL-terminated UTF-8 string.  The NULL terminator is
567     *      considered part of the UTF-8 string.  Equivalent to Plan 9
568     *      utfrrune().
569     *
570     * Results:
571     *      As above.  If the Tcl_UniChar does not exist in the given string,
572     *      the return value is NULL.
573     *
574     * Side effects:
575     *      None.
576     *
577     *---------------------------------------------------------------------------
578     */
579    
580    char *
581    Tcl_UtfFindLast(string, ch)
582        CONST char *string;         /* The UTF-8 string to be searched. */
583        int ch;                     /* The Tcl_UniChar to search for. */
584    {
585        int len;
586        Tcl_UniChar find;
587        CONST char *last;
588            
589        last = NULL;
590        while (1) {
591            len = Tcl_UtfToUniChar(string, &find);
592            if (find == ch) {
593                last = string;
594            }
595            if (*string == '\0') {
596                break;
597            }
598            string += len;
599        }
600        return (char *) last;
601    }
602    
603    /*
604     *---------------------------------------------------------------------------
605     *
606     * Tcl_UtfNext --
607     *
608     *      Given a pointer to some current location in a UTF-8 string,
609     *      move forward one character.  The caller must ensure that they
610     *      are not asking for the next character after the last character
611     *      in the string.
612     *
613     * Results:
614     *      The return value is the pointer to the next character in
615     *      the UTF-8 string.
616     *
617     * Side effects:
618     *      None.
619     *
620     *---------------------------------------------------------------------------
621     */
622    
623    char *
624    Tcl_UtfNext(str)
625        CONST char *str;                /* The current location in the string. */
626    {
627        Tcl_UniChar ch;
628    
629        return (char *) str + Tcl_UtfToUniChar(str, &ch);
630    }
631    
632    /*
633     *---------------------------------------------------------------------------
634     *
635     * Tcl_UtfPrev --
636     *
637     *      Given a pointer to some current location in a UTF-8 string,
638     *      move backwards one character.
639     *
640     * Results:
641     *      The return value is a pointer to the previous character in the
642     *      UTF-8 string.  If the current location was already at the
643     *      beginning of the string, the return value will also be a
644     *      pointer to the beginning of the string.
645     *
646     * Side effects:
647     *      None.
648     *
649     *---------------------------------------------------------------------------
650     */
651    
652    char *
653    Tcl_UtfPrev(str, start)
654        CONST char *str;                /* The current location in the string. */
655        CONST char *start;              /* Pointer to the beginning of the
656                                         * string, to avoid going backwards too
657                                         * far. */
658    {
659        CONST char *look;
660        int i, byte;
661        
662        str--;
663        look = str;
664        for (i = 0; i < TCL_UTF_MAX; i++) {
665            if (look < start) {
666                if (str < start) {
667                    str = start;
668                }
669                break;
670            }
671            byte = *((unsigned char *) look);
672            if (byte < 0x80) {
673                break;
674            }
675            if (byte >= 0xC0) {
676                if (totalBytes[byte] != i + 1) {
677                    break;
678                }
679                return (char *) look;
680            }
681            look--;
682        }
683        return (char *) str;
684    }
685          
686    /*
687     *---------------------------------------------------------------------------
688     *
689     * Tcl_UniCharAtIndex --
690     *
691     *      Returns the Unicode character represented at the specified
692     *      character (not byte) position in the UTF-8 string.
693     *
694     * Results:
695     *      As above.
696     *
697     * Side effects:
698     *      None.
699     *
700     *---------------------------------------------------------------------------
701     */
702    
703    Tcl_UniChar
704    Tcl_UniCharAtIndex(src, index)
705        register CONST char *src;   /* The UTF-8 string to dereference. */
706        register int index;         /* The position of the desired character. */
707    {
708        Tcl_UniChar ch;
709    
710        while (index >= 0) {
711            index--;
712            src += Tcl_UtfToUniChar(src, &ch);
713        }
714        return ch;
715    }
716    
717    /*
718     *---------------------------------------------------------------------------
719     *
720     * Tcl_UtfAtIndex --
721     *
722     *      Returns a pointer to the specified character (not byte) position
723     *      in the UTF-8 string.
724     *
725     * Results:
726     *      As above.
727     *
728     * Side effects:
729     *      None.
730     *
731     *---------------------------------------------------------------------------
732     */
733    
734    char *
735    Tcl_UtfAtIndex(src, index)
736        register CONST char *src;   /* The UTF-8 string. */
737        register int index;         /* The position of the desired character. */
738    {
739        Tcl_UniChar ch;
740        
741        while (index > 0) {
742            index--;
743            src += Tcl_UtfToUniChar(src, &ch);
744        }
745        return (char *) src;
746    }
747    
748    /*
749     *---------------------------------------------------------------------------
750     *
751     * Tcl_UtfBackslash --
752     *
753     *      Figure out how to handle a backslash sequence.
754     *
755     * Results:
756     *      Stores the bytes represented by the backslash sequence in dst and
757     *      returns the number of bytes written to dst.  At most TCL_UTF_MAX
758     *      bytes are written to dst; dst must have been large enough to accept
759     *      those bytes.  If readPtr isn't NULL then it is filled in with a
760     *      count of the number of bytes in the backslash sequence.  
761     *
762     * Side effects:
763     *      The maximum number of bytes it takes to represent a Unicode
764     *      character in UTF-8 is guaranteed to be less than the number of
765     *      bytes used to express the backslash sequence that represents
766     *      that Unicode character.  If the target buffer into which the
767     *      caller is going to store the bytes that represent the Unicode
768     *      character is at least as large as the source buffer from which
769     *      the backslashed sequence was extracted, no buffer overruns should
770     *      occur.
771     *
772     *---------------------------------------------------------------------------
773     */
774    
775    int
776    Tcl_UtfBackslash(src, readPtr, dst)
777        CONST char *src;            /* Points to the backslash character of
778                                     * a backslash sequence. */
779        int *readPtr;               /* Fill in with number of characters read
780                                     * from src, unless NULL. */
781        char *dst;                  /* Filled with the bytes represented by the
782                                     * backslash sequence. */
783    {
784        register CONST char *p = src+1;
785        int result, count, n;
786        char buf[TCL_UTF_MAX];
787    
788        if (dst == NULL) {
789            dst = buf;
790        }
791    
792        count = 2;
793        switch (*p) {
794            /*
795             * Note: in the conversions below, use absolute values (e.g.,
796             * 0xa) rather than symbolic values (e.g. \n) that get converted
797             * by the compiler.  It's possible that compilers on some
798             * platforms will do the symbolic conversions differently, which
799             * could result in non-portable Tcl scripts.
800             */
801    
802            case 'a':
803                result = 0x7;
804                break;
805            case 'b':
806                result = 0x8;
807                break;
808            case 'f':
809                result = 0xc;
810                break;
811            case 'n':
812                result = 0xa;
813                break;
814            case 'r':
815                result = 0xd;
816                break;
817            case 't':
818                result = 0x9;
819                break;
820            case 'v':
821                result = 0xb;
822                break;
823            case 'x':
824                if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
825                    char *end;
826    
827                    result = (unsigned char) strtoul(p+1, &end, 16);
828                    count = end - src;
829                } else {
830                    count = 2;
831                    result = 'x';
832                }
833                break;
834            case 'u':
835                result = 0;
836                for (count = 0; count < 4; count++) {
837                    p++;
838                    if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
839                        break;
840                    }
841                    n = *p - '0';
842                    if (n > 9) {
843                        n = n + '0' + 10 - 'A';
844                    }
845                    if (n > 16) {
846                        n = n + 'A' - 'a';
847                    }
848                    result = (result << 4) + n;
849                }
850                if (count == 0) {
851                    result = 'u';
852                }
853                count += 2;
854                break;
855                        
856            case '\n':
857                do {
858                    p++;
859                } while ((*p == ' ') || (*p == '\t'));
860                result = ' ';
861                count = p - src;
862                break;
863            case 0:
864                result = '\\';
865                count = 1;
866                break;
867            default:
868                /*
869                 * Check for an octal number \oo?o?
870                 */
871                if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
872                    result = (unsigned char)(*p - '0');
873                    p++;
874                    if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
875                        break;
876                    }
877                    count = 3;
878                    result = (unsigned char)((result << 3) + (*p - '0'));
879                    p++;
880                    if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
881                        break;
882                    }
883                    count = 4;
884                    result = (unsigned char)((result << 3) + (*p - '0'));
885                    break;
886                }
887                result = *p;
888                count = 2;
889                break;
890        }
891    
892        if (readPtr != NULL) {
893            *readPtr = count;
894        }
895        return Tcl_UniCharToUtf(result, dst);
896    }
897    
898    /*
899     *----------------------------------------------------------------------
900     *
901     * Tcl_UtfToUpper --
902     *
903     *      Convert lowercase characters to uppercase characters in a UTF
904     *      string in place.  The conversion may shrink the UTF string.
905     *
906     * Results:
907     *      Returns the number of bytes in the resulting string
908     *      excluding the trailing null.
909     *
910     * Side effects:
911     *      Writes a terminating null after the last converted character.
912     *
913     *----------------------------------------------------------------------
914     */
915    
916    int
917    Tcl_UtfToUpper(str)
918        char *str;                  /* String to convert in place. */
919    {
920        Tcl_UniChar ch, upChar;
921        char *src, *dst;
922        int bytes;
923    
924        /*
925         * Iterate over the string until we hit the terminating null.
926         */
927    
928        src = dst = str;
929        while (*src) {
930            bytes = Tcl_UtfToUniChar(src, &ch);
931            upChar = Tcl_UniCharToUpper(ch);
932    
933            /*
934             * To keep badly formed Utf strings from getting inflated by
935             * the conversion (thereby causing a segfault), only copy the
936             * upper case char to dst if its size is <= the original char.
937             */
938            
939            if (bytes < UtfCount(upChar)) {
940                memcpy(dst, src, (size_t) bytes);
941                dst += bytes;
942            } else {
943                dst += Tcl_UniCharToUtf(upChar, dst);
944            }
945            src += bytes;
946        }
947        *dst = '\0';
948        return (dst - str);
949    }
950    
951    /*
952     *----------------------------------------------------------------------
953     *
954     * Tcl_UtfToLower --
955     *
956     *      Convert uppercase characters to lowercase characters in a UTF
957     *      string in place.  The conversion may shrink the UTF string.
958     *
959     * Results:
960     *      Returns the number of bytes in the resulting string
961     *      excluding the trailing null.
962     *
963     * Side effects:
964     *      Writes a terminating null after the last converted character.
965     *
966     *----------------------------------------------------------------------
967     */
968    
969    int
970    Tcl_UtfToLower(str)
971        char *str;                  /* String to convert in place. */
972    {
973        Tcl_UniChar ch, lowChar;
974        char *src, *dst;
975        int bytes;
976        
977        /*
978         * Iterate over the string until we hit the terminating null.
979         */
980    
981        src = dst = str;
982        while (*src) {
983            bytes = Tcl_UtfToUniChar(src, &ch);
984            lowChar = Tcl_UniCharToLower(ch);
985    
986            /*
987             * To keep badly formed Utf strings from getting inflated by
988             * the conversion (thereby causing a segfault), only copy the
989             * lower case char to dst if its size is <= the original char.
990             */
991            
992            if (bytes < UtfCount(lowChar)) {
993                memcpy(dst, src, (size_t) bytes);
994                dst += bytes;
995            } else {
996                dst += Tcl_UniCharToUtf(lowChar, dst);
997            }
998            src += bytes;
999        }
1000        *dst = '\0';
1001        return (dst - str);
1002    }
1003    
1004    /*
1005     *----------------------------------------------------------------------
1006     *
1007     * Tcl_UtfToTitle --
1008     *
1009     *      Changes the first character of a UTF string to title case or
1010     *      uppercase and the rest of the string to lowercase.  The
1011     *      conversion happens in place and may shrink the UTF string.
1012     *
1013     * Results:
1014     *      Returns the number of bytes in the resulting string
1015     *      excluding the trailing null.
1016     *
1017     * Side effects:
1018     *      Writes a terminating null after the last converted character.
1019     *
1020     *----------------------------------------------------------------------
1021     */
1022    
1023    int
1024    Tcl_UtfToTitle(str)
1025        char *str;                  /* String to convert in place. */
1026    {
1027        Tcl_UniChar ch, titleChar, lowChar;
1028        char *src, *dst;
1029        int bytes;
1030        
1031        /*
1032         * Capitalize the first character and then lowercase the rest of the
1033         * characters until we get to a null.
1034         */
1035    
1036        src = dst = str;
1037    
1038        if (*src) {
1039            bytes = Tcl_UtfToUniChar(src, &ch);
1040            titleChar = Tcl_UniCharToTitle(ch);
1041    
1042            if (bytes < UtfCount(titleChar)) {
1043                memcpy(dst, src, (size_t) bytes);
1044                dst += bytes;
1045            } else {
1046                dst += Tcl_UniCharToUtf(titleChar, dst);
1047            }
1048            src += bytes;
1049        }
1050        while (*src) {
1051            bytes = Tcl_UtfToUniChar(src, &ch);
1052            lowChar = Tcl_UniCharToLower(ch);
1053    
1054            if (bytes < UtfCount(lowChar)) {
1055                memcpy(dst, src, (size_t) bytes);
1056                dst += bytes;
1057            } else {
1058                dst += Tcl_UniCharToUtf(lowChar, dst);
1059            }
1060            src += bytes;
1061        }
1062        *dst = '\0';
1063        return (dst - str);
1064    }
1065    
1066    /*
1067     *----------------------------------------------------------------------
1068     *
1069     * Tcl_UtfNcmp --
1070     *
1071     *      Compare at most n UTF chars of string cs to string ct.  Both cs
1072     *      and ct are assumed to be at least n UTF chars long.
1073     *
1074     * Results:
1075     *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1076     *
1077     * Side effects:
1078     *      None.
1079     *
1080     *----------------------------------------------------------------------
1081     */
1082    
1083    int
1084    Tcl_UtfNcmp(cs, ct, n)
1085        CONST char *cs;             /* UTF string to compare to ct. */
1086        CONST char *ct;             /* UTF string cs is compared to. */
1087        unsigned long n;            /* Number of UTF chars to compare. */
1088    {
1089        Tcl_UniChar ch1, ch2;
1090        /*
1091         * Another approach that should work is:
1092         *   return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1093         * That assumes that ct is a properly formed UTF, so we will just
1094         * be comparing the bytes that compromise those strings to the
1095         * char length n.
1096         */
1097        while (n-- > 0) {
1098            /*
1099             * n must be interpreted as chars, not bytes.
1100             * This should be called only when both strings are of
1101             * at least n chars long (no need for \0 check)
1102             */
1103            cs += Tcl_UtfToUniChar(cs, &ch1);
1104            ct += Tcl_UtfToUniChar(ct, &ch2);
1105            if (ch1 != ch2) {
1106                return (ch1 - ch2);
1107            }
1108        }
1109        return 0;
1110    }
1111    
1112    /*
1113     *----------------------------------------------------------------------
1114     *
1115     * Tcl_UtfNcasecmp --
1116     *
1117     *      Compare at most n UTF chars of string cs to string ct case
1118     *      insensitive.  Both cs and ct are assumed to be at least n
1119     *      UTF chars long.
1120     *
1121     * Results:
1122     *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1123     *
1124     * Side effects:
1125     *      None.
1126     *
1127     *----------------------------------------------------------------------
1128     */
1129    
1130    int
1131    Tcl_UtfNcasecmp(cs, ct, n)
1132        CONST char *cs;             /* UTF string to compare to ct. */
1133        CONST char *ct;             /* UTF string cs is compared to. */
1134        unsigned long n;                    /* Number of UTF chars to compare. */
1135    {
1136        Tcl_UniChar ch1, ch2;
1137        while (n-- > 0) {
1138            /*
1139             * n must be interpreted as chars, not bytes.
1140             * This should be called only when both strings are of
1141             * at least n chars long (no need for \0 check)
1142             */
1143            cs += Tcl_UtfToUniChar(cs, &ch1);
1144            ct += Tcl_UtfToUniChar(ct, &ch2);
1145            if (ch1 != ch2) {
1146                ch1 = Tcl_UniCharToLower(ch1);
1147                ch2 = Tcl_UniCharToLower(ch2);
1148                if (ch1 != ch2) {
1149                    return (ch1 - ch2);
1150                }
1151            }
1152        }
1153        return 0;
1154    }
1155    
1156    /*
1157     *----------------------------------------------------------------------
1158     *
1159     * Tcl_UniCharToUpper --
1160     *
1161     *      Compute the uppercase equivalent of the given Unicode character.
1162     *
1163     * Results:
1164     *      Returns the uppercase Unicode character.
1165     *
1166     * Side effects:
1167     *      None.
1168     *
1169     *----------------------------------------------------------------------
1170     */
1171    
1172    Tcl_UniChar
1173    Tcl_UniCharToUpper(ch)
1174        int ch;                     /* Unicode character to convert. */
1175    {
1176        int info = GetUniCharInfo(ch);
1177    
1178        if (GetCaseType(info) & 0x04) {
1179            return (Tcl_UniChar) (ch - GetDelta(info));
1180        } else {
1181            return ch;
1182        }
1183    }
1184    
1185    /*
1186     *----------------------------------------------------------------------
1187     *
1188     * Tcl_UniCharToLower --
1189     *
1190     *      Compute the lowercase equivalent of the given Unicode character.
1191     *
1192     * Results:
1193     *      Returns the lowercase Unicode character.
1194     *
1195     * Side effects:
1196     *      None.
1197     *
1198     *----------------------------------------------------------------------
1199     */
1200    
1201    Tcl_UniChar
1202    Tcl_UniCharToLower(ch)
1203        int ch;                     /* Unicode character to convert. */
1204    {
1205        int info = GetUniCharInfo(ch);
1206    
1207        if (GetCaseType(info) & 0x02) {
1208            return (Tcl_UniChar) (ch + GetDelta(info));
1209        } else {
1210            return ch;
1211        }
1212    }
1213    
1214    /*
1215     *----------------------------------------------------------------------
1216     *
1217     * Tcl_UniCharToTitle --
1218     *
1219     *      Compute the titlecase equivalent of the given Unicode character.
1220     *
1221     * Results:
1222     *      Returns the titlecase Unicode character.
1223     *
1224     * Side effects:
1225     *      None.
1226     *
1227     *----------------------------------------------------------------------
1228     */
1229    
1230    Tcl_UniChar
1231    Tcl_UniCharToTitle(ch)
1232        int ch;                     /* Unicode character to convert. */
1233    {
1234        int info = GetUniCharInfo(ch);
1235        int mode = GetCaseType(info);
1236    
1237        if (mode & 0x1) {
1238            /*
1239             * Subtract or add one depending on the original case.
1240             */
1241    
1242            return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1243        } else if (mode == 0x4) {
1244            return (Tcl_UniChar) (ch - GetDelta(info));
1245        } else {
1246            return ch;
1247        }
1248    }
1249    
1250    /*
1251     *----------------------------------------------------------------------
1252     *
1253     * Tcl_UniCharLen --
1254     *
1255     *      Find the length of a UniChar string.  The str input must be null
1256     *      terminated.
1257     *
1258     * Results:
1259     *      Returns the length of str in UniChars (not bytes).
1260     *
1261     * Side effects:
1262     *      None.
1263     *
1264     *----------------------------------------------------------------------
1265     */
1266    
1267    int
1268    Tcl_UniCharLen(str)
1269        Tcl_UniChar *str;           /* Unicode string to find length of. */
1270    {
1271        int len = 0;
1272        
1273        while (*str != '\0') {
1274            len++;
1275            str++;
1276        }
1277        return len;
1278    }
1279    
1280    /*
1281     *----------------------------------------------------------------------
1282     *
1283     * Tcl_UniCharNcmp --
1284     *
1285     *      Compare at most n unichars of string cs to string ct.  Both cs
1286     *      and ct are assumed to be at least n unichars long.
1287     *
1288     * Results:
1289     *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1290     *
1291     * Side effects:
1292     *      None.
1293     *
1294     *----------------------------------------------------------------------
1295     */
1296    
1297    int
1298    Tcl_UniCharNcmp(cs, ct, n)
1299        CONST Tcl_UniChar *cs;              /* Unicode string to compare to ct. */
1300        CONST Tcl_UniChar *ct;              /* Unicode string cs is compared to. */
1301        unsigned long n;                    /* Number of unichars to compare. */
1302    {
1303        for ( ; n != 0; n--, cs++, ct++) {
1304            if (*cs != *ct) {
1305                return *cs - *ct;
1306            }
1307            if (*cs == '\0') {
1308                break;
1309            }
1310        }
1311        return 0;
1312    }
1313    
1314    /*
1315     *----------------------------------------------------------------------
1316     *
1317     * Tcl_UniCharIsAlnum --
1318     *
1319     *      Test if a character is an alphanumeric Unicode character.
1320     *
1321     * Results:
1322     *      Returns 1 if character is alphanumeric.
1323     *
1324     * Side effects:
1325     *      None.
1326     *
1327     *----------------------------------------------------------------------
1328     */
1329    
1330    int
1331    Tcl_UniCharIsAlnum(ch)
1332        int ch;                     /* Unicode character to test. */
1333    {
1334        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1335    
1336        return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1337    }
1338    
1339    /*
1340     *----------------------------------------------------------------------
1341     *
1342     * Tcl_UniCharIsAlpha --
1343     *
1344     *      Test if a character is an alphabetic Unicode character.
1345     *
1346     * Results:
1347     *      Returns 1 if character is alphabetic.
1348     *
1349     * Side effects:
1350     *      None.
1351     *
1352     *----------------------------------------------------------------------
1353     */
1354    
1355    int
1356    Tcl_UniCharIsAlpha(ch)
1357        int ch;                     /* Unicode character to test. */
1358    {
1359        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1360        return ((ALPHA_BITS >> category) & 1);
1361    }
1362    
1363    /*
1364     *----------------------------------------------------------------------
1365     *
1366     * Tcl_UniCharIsControl --
1367     *
1368     *      Test if a character is a Unicode control character.
1369     *
1370     * Results:
1371     *      Returns non-zero if character is a control.
1372     *
1373     * Side effects:
1374     *      None.
1375     *
1376     *----------------------------------------------------------------------
1377     */
1378    
1379    int
1380    Tcl_UniCharIsControl(ch)
1381        int ch;                     /* Unicode character to test. */
1382    {
1383        return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1384    }
1385    
1386    /*
1387     *----------------------------------------------------------------------
1388     *
1389     * Tcl_UniCharIsDigit --
1390     *
1391     *      Test if a character is a numeric Unicode character.
1392     *
1393     * Results:
1394     *      Returns non-zero if character is a digit.
1395     *
1396     * Side effects:
1397     *      None.
1398     *
1399     *----------------------------------------------------------------------
1400     */
1401    
1402    int
1403    Tcl_UniCharIsDigit(ch)
1404        int ch;                     /* Unicode character to test. */
1405    {
1406        return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1407                == DECIMAL_DIGIT_NUMBER);
1408    }
1409    
1410    /*
1411     *----------------------------------------------------------------------
1412     *
1413     * Tcl_UniCharIsGraph --
1414     *
1415     *      Test if a character is any Unicode print character except space.
1416     *
1417     * Results:
1418     *      Returns non-zero if character is printable, but not space.
1419     *
1420     * Side effects:
1421     *      None.
1422     *
1423     *----------------------------------------------------------------------
1424     */
1425    
1426    int
1427    Tcl_UniCharIsGraph(ch)
1428        int ch;                     /* Unicode character to test. */
1429    {
1430        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1431        return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1432    }
1433    
1434    /*
1435     *----------------------------------------------------------------------
1436     *
1437     * Tcl_UniCharIsLower --
1438     *
1439     *      Test if a character is a lowercase Unicode character.
1440     *
1441     * Results:
1442     *      Returns non-zero if character is lowercase.
1443     *
1444     * Side effects:
1445     *      None.
1446     *
1447     *----------------------------------------------------------------------
1448     */
1449    
1450    int
1451    Tcl_UniCharIsLower(ch)
1452        int ch;                     /* Unicode character to test. */
1453    {
1454        return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1455    }
1456    
1457    /*
1458     *----------------------------------------------------------------------
1459     *
1460     * Tcl_UniCharIsPrint --
1461     *
1462     *      Test if a character is a Unicode print character.
1463     *
1464     * Results:
1465     *      Returns non-zero if character is printable.
1466     *
1467     * Side effects:
1468     *      None.
1469     *
1470     *----------------------------------------------------------------------
1471     */
1472    
1473    int
1474    Tcl_UniCharIsPrint(ch)
1475        int ch;                     /* Unicode character to test. */
1476    {
1477        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1478        return ((PRINT_BITS >> category) & 1);
1479    }
1480    
1481    /*
1482     *----------------------------------------------------------------------
1483     *
1484     * Tcl_UniCharIsPunct --
1485     *
1486     *      Test if a character is a Unicode punctuation character.
1487     *
1488     * Results:
1489     *      Returns non-zero if character is punct.
1490     *
1491     * Side effects:
1492     *      None.
1493     *
1494     *----------------------------------------------------------------------
1495     */
1496    
1497    int
1498    Tcl_UniCharIsPunct(ch)
1499        int ch;                     /* Unicode character to test. */
1500    {
1501        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1502        return ((PUNCT_BITS >> category) & 1);
1503    }
1504    
1505    /*
1506     *----------------------------------------------------------------------
1507     *
1508     * Tcl_UniCharIsSpace --
1509     *
1510     *      Test if a character is a whitespace Unicode character.
1511     *
1512     * Results:
1513     *      Returns non-zero if character is a space.
1514     *
1515     * Side effects:
1516     *      None.
1517     *
1518     *----------------------------------------------------------------------
1519     */
1520    
1521    int
1522    Tcl_UniCharIsSpace(ch)
1523        int ch;                     /* Unicode character to test. */
1524    {
1525        register int category;
1526    
1527        /*
1528         * If the character is within the first 127 characters, just use the
1529         * standard C function, otherwise consult the Unicode table.
1530         */
1531    
1532        if (ch < 0x80) {
1533            return isspace(UCHAR(ch)); /* INTL: ISO space */
1534        } else {
1535            category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1536            return ((SPACE_BITS >> category) & 1);
1537        }
1538    }
1539    
1540    /*
1541     *----------------------------------------------------------------------
1542     *
1543     * Tcl_UniCharIsUpper --
1544     *
1545     *      Test if a character is a uppercase Unicode character.
1546     *
1547     * Results:
1548     *      Returns non-zero if character is uppercase.
1549     *
1550     * Side effects:
1551     *      None.
1552     *
1553     *----------------------------------------------------------------------
1554     */
1555    
1556    int
1557    Tcl_UniCharIsUpper(ch)
1558        int ch;                     /* Unicode character to test. */
1559    {
1560        return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1561    }
1562    
1563    /*
1564     *----------------------------------------------------------------------
1565     *
1566     * Tcl_UniCharIsWordChar --
1567     *
1568     *      Test if a character is alphanumeric or a connector punctuation
1569     *      mark.
1570     *
1571     * Results:
1572     *      Returns 1 if character is a word character.
1573     *
1574     * Side effects:
1575     *      None.
1576     *
1577     *----------------------------------------------------------------------
1578     */
1579    
1580    int
1581    Tcl_UniCharIsWordChar(ch)
1582        int ch;                     /* Unicode character to test. */
1583    {
1584        register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1585    
1586        return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1587    }
1588    
1589    /* End of tclutf.c */

Legend:
Removed from v.25  
changed lines
  Added in v.71

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25