|
/* $Header: /cvsroot/esrg/sfesrg/esrgpcpj/shared/tcl_base/tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ */ |
|
|
|
|
|
/* |
|
|
* tclUtf.c -- |
|
|
* |
|
|
* Routines for manipulating UTF-8 strings. |
|
|
* |
|
|
* Copyright (c) 1997-1998 Sun Microsystems, Inc. |
|
|
* |
|
|
* See the file "license.terms" for information on usage and redistribution |
|
|
* of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
|
|
* |
|
|
* RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ |
|
|
*/ |
|
|
|
|
|
#include "tclInt.h" |
|
|
|
|
|
/* |
|
|
* Include the static character classification tables and macros. |
|
|
*/ |
|
|
|
|
|
#include "tclUniData.c" |
|
|
|
|
|
/*
 * Bit masks used for fast character category tests.  Each x_BITS value has
 * one bit set for every Unicode category that belongs to the set, so a mask
 * shifted right by a category value has its low bit set exactly when that
 * category is a member of the set.
 */

#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) \
    | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) \
    | (1 << MODIFIER_LETTER) \
    | (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) \
    | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Every punctuation category, connector punctuation included.
 */

#define PUNCT_BITS \
    (CONNECTOR_BITS \
    | (1 << DASH_PUNCTUATION) \
    | (1 << OPEN_PUNCTUATION) \
    | (1 << CLOSE_PUNCTUATION) \
    | (1 << INITIAL_QUOTE_PUNCTUATION) \
    | (1 << FINAL_QUOTE_PUNCTUATION) \
    | (1 << OTHER_PUNCTUATION))

/*
 * Letters, digits, separators and punctuation, plus marks, the remaining
 * number categories and all symbol categories.
 */

#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS \
    | (1 << NON_SPACING_MARK) \
    | (1 << ENCLOSING_MARK) \
    | (1 << COMBINING_SPACING_MARK) \
    | (1 << LETTER_NUMBER) \
    | (1 << OTHER_NUMBER) \
    | (1 << MATH_SYMBOL) \
    | (1 << CURRENCY_SYMBOL) \
    | (1 << MODIFIER_SYMBOL) \
    | (1 << OTHER_SYMBOL))
|
|
|
|
|
/*
 * Threshold below which a Unicode character is stored in a UTF-8 string
 * as a single byte holding the character's own value.
 */

#define UNICODE_SELF 0x80
|
|
|
|
|
/* |
|
|
* The following structures are used when mapping between Unicode (UCS-2) |
|
|
* and UTF-8. |
|
|
*/ |
|
|
|
|
|
CONST unsigned char totalBytes[256] = { |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
|
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, |
|
|
#if TCL_UTF_MAX > 3 |
|
|
4,4,4,4,4,4,4,4, |
|
|
#else |
|
|
1,1,1,1,1,1,1,1, |
|
|
#endif |
|
|
#if TCL_UTF_MAX > 4 |
|
|
5,5,5,5, |
|
|
#else |
|
|
1,1,1,1, |
|
|
#endif |
|
|
#if TCL_UTF_MAX > 5 |
|
|
6,6,6,6 |
|
|
#else |
|
|
1,1,1,1 |
|
|
#endif |
|
|
}; |
|
|
|
|
|
/* |
|
|
* Procedures used only in this module. |
|
|
*/ |
|
|
|
|
|
static int UtfCount _ANSI_ARGS_((int ch)); |
|
|
|
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* UtfCount -- |
|
|
* |
|
|
* Find the number of bytes in the Utf character "ch". |
|
|
* |
|
|
* Results: |
|
|
* The return values is the number of bytes in the Utf character "ch". |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
static int |
|
|
UtfCount(ch) |
|
|
int ch; /* The Tcl_UniChar whose size is returned. */ |
|
|
{ |
|
|
if ((ch > 0) && (ch < UNICODE_SELF)) { |
|
|
return 1; |
|
|
} |
|
|
if (ch <= 0x7FF) { |
|
|
return 2; |
|
|
} |
|
|
if (ch <= 0xFFFF) { |
|
|
return 3; |
|
|
} |
|
|
#if TCL_UTF_MAX > 3 |
|
|
if (ch <= 0x1FFFFF) { |
|
|
return 4; |
|
|
} |
|
|
if (ch <= 0x3FFFFFF) { |
|
|
return 5; |
|
|
} |
|
|
if (ch <= 0x7FFFFFFF) { |
|
|
return 6; |
|
|
} |
|
|
#endif |
|
|
return 3; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharToUtf -- |
|
|
* |
|
|
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the |
|
|
* provided buffer. Equivalent to Plan 9 runetochar(). |
|
|
* |
|
|
* Results: |
|
|
* The return values is the number of bytes in the buffer that |
|
|
* were consumed. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
INLINE int |
|
|
Tcl_UniCharToUtf(ch, str) |
|
|
int ch; /* The Tcl_UniChar to be stored in the |
|
|
* buffer. */ |
|
|
char *str; /* Buffer in which the UTF-8 representation |
|
|
* of the Tcl_UniChar is stored. Buffer must |
|
|
* be large enough to hold the UTF-8 character |
|
|
* (at most TCL_UTF_MAX bytes). */ |
|
|
{ |
|
|
if ((ch > 0) && (ch < UNICODE_SELF)) { |
|
|
str[0] = (char) ch; |
|
|
return 1; |
|
|
} |
|
|
if (ch <= 0x7FF) { |
|
|
str[1] = (char) ((ch | 0x80) & 0xBF); |
|
|
str[0] = (char) ((ch >> 6) | 0xC0); |
|
|
return 2; |
|
|
} |
|
|
if (ch <= 0xFFFF) { |
|
|
three: |
|
|
str[2] = (char) ((ch | 0x80) & 0xBF); |
|
|
str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); |
|
|
str[0] = (char) ((ch >> 12) | 0xE0); |
|
|
return 3; |
|
|
} |
|
|
|
|
|
#if TCL_UTF_MAX > 3 |
|
|
if (ch <= 0x1FFFFF) { |
|
|
str[3] = (char) ((ch | 0x80) & 0xBF); |
|
|
str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); |
|
|
str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); |
|
|
str[0] = (char) ((ch >> 18) | 0xF0); |
|
|
return 4; |
|
|
} |
|
|
if (ch <= 0x3FFFFFF) { |
|
|
str[4] = (char) ((ch | 0x80) & 0xBF); |
|
|
str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); |
|
|
str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); |
|
|
str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); |
|
|
str[0] = (char) ((ch >> 24) | 0xF8); |
|
|
return 5; |
|
|
} |
|
|
if (ch <= 0x7FFFFFFF) { |
|
|
str[5] = (char) ((ch | 0x80) & 0xBF); |
|
|
str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); |
|
|
str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); |
|
|
str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); |
|
|
str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); |
|
|
str[0] = (char) ((ch >> 30) | 0xFC); |
|
|
return 6; |
|
|
} |
|
|
#endif |
|
|
|
|
|
ch = 0xFFFD; |
|
|
goto three; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharToUtfDString -- |
|
|
* |
|
|
* Convert the given Unicode string to UTF-8. |
|
|
* |
|
|
* Results: |
|
|
* The return value is a pointer to the UTF-8 representation of the |
|
|
* Unicode string. Storage for the return value is appended to the |
|
|
* end of dsPtr. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
char * |
|
|
Tcl_UniCharToUtfDString(wString, numChars, dsPtr) |
|
|
CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ |
|
|
int numChars; /* Length of Unicode string in Tcl_UniChars |
|
|
* (must be >= 0). */ |
|
|
Tcl_DString *dsPtr; /* UTF-8 representation of string is |
|
|
* appended to this previously initialized |
|
|
* DString. */ |
|
|
{ |
|
|
CONST Tcl_UniChar *w, *wEnd; |
|
|
char *p, *string; |
|
|
int oldLength; |
|
|
|
|
|
/* |
|
|
* UTF-8 string length in bytes will be <= Unicode string length * |
|
|
* TCL_UTF_MAX. |
|
|
*/ |
|
|
|
|
|
oldLength = Tcl_DStringLength(dsPtr); |
|
|
Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); |
|
|
string = Tcl_DStringValue(dsPtr) + oldLength; |
|
|
|
|
|
p = string; |
|
|
wEnd = wString + numChars; |
|
|
for (w = wString; w < wEnd; ) { |
|
|
p += Tcl_UniCharToUtf(*w, p); |
|
|
w++; |
|
|
} |
|
|
Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); |
|
|
|
|
|
return string; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfToUniChar -- |
|
|
* |
|
|
* Extract the Tcl_UniChar represented by the UTF-8 string. Bad |
|
|
* UTF-8 sequences are converted to valid Tcl_UniChars and processing |
|
|
* continues. Equivalent to Plan 9 chartorune(). |
|
|
* |
|
|
* The caller must ensure that the source buffer is long enough that |
|
|
* this routine does not run off the end and dereference non-existent |
|
|
* memory looking for trail bytes. If the source buffer is known to |
|
|
* be '\0' terminated, this cannot happen. Otherwise, the caller |
|
|
* should call Tcl_UtfCharComplete() before calling this routine to |
|
|
* ensure that enough bytes remain in the string. |
|
|
* |
|
|
* Results: |
|
|
* *chPtr is filled with the Tcl_UniChar, and the return value is the |
|
|
* number of bytes from the UTF-8 string that were consumed. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfToUniChar(str, chPtr) |
|
|
register CONST char *str; /* The UTF-8 string. */ |
|
|
register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented |
|
|
* by the UTF-8 string. */ |
|
|
{ |
|
|
register int byte; |
|
|
|
|
|
/* |
|
|
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. |
|
|
*/ |
|
|
|
|
|
byte = *((unsigned char *) str); |
|
|
if (byte < 0xC0) { |
|
|
/* |
|
|
* Handles properly formed UTF-8 characters between 0x01 and 0x7F. |
|
|
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid |
|
|
* characters representing themselves. |
|
|
*/ |
|
|
|
|
|
*chPtr = (Tcl_UniChar) byte; |
|
|
return 1; |
|
|
} else if (byte < 0xE0) { |
|
|
if ((str[1] & 0xC0) == 0x80) { |
|
|
/* |
|
|
* Two-byte-character lead-byte followed by a trail-byte. |
|
|
*/ |
|
|
|
|
|
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); |
|
|
return 2; |
|
|
} |
|
|
/* |
|
|
* A two-byte-character lead-byte not followed by trail-byte |
|
|
* represents itself. |
|
|
*/ |
|
|
|
|
|
*chPtr = (Tcl_UniChar) byte; |
|
|
return 1; |
|
|
} else if (byte < 0xF0) { |
|
|
if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { |
|
|
/* |
|
|
* Three-byte-character lead byte followed by two trail bytes. |
|
|
*/ |
|
|
|
|
|
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) |
|
|
| ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); |
|
|
return 3; |
|
|
} |
|
|
/* |
|
|
* A three-byte-character lead-byte not followed by two trail-bytes |
|
|
* represents itself. |
|
|
*/ |
|
|
|
|
|
*chPtr = (Tcl_UniChar) byte; |
|
|
return 1; |
|
|
} |
|
|
#if TCL_UTF_MAX > 3 |
|
|
else { |
|
|
int ch, total, trail; |
|
|
|
|
|
total = totalBytes[byte]; |
|
|
trail = total - 1; |
|
|
if (trail > 0) { |
|
|
ch = byte & (0x3F >> trail); |
|
|
do { |
|
|
str++; |
|
|
if ((*str & 0xC0) != 0x80) { |
|
|
*chPtr = byte; |
|
|
return 1; |
|
|
} |
|
|
ch <<= 6; |
|
|
ch |= (*str & 0x3F); |
|
|
trail--; |
|
|
} while (trail > 0); |
|
|
*chPtr = ch; |
|
|
return total; |
|
|
} |
|
|
} |
|
|
#endif |
|
|
|
|
|
*chPtr = (Tcl_UniChar) byte; |
|
|
return 1; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfToUniCharDString -- |
|
|
* |
|
|
* Convert the UTF-8 string to Unicode. |
|
|
* |
|
|
* Results: |
|
|
* The return value is a pointer to the Unicode representation of the |
|
|
* UTF-8 string. Storage for the return value is appended to the |
|
|
* end of dsPtr. The Unicode string is terminated with a Unicode |
|
|
* NULL character. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
Tcl_UniChar * |
|
|
Tcl_UtfToUniCharDString(string, length, dsPtr) |
|
|
CONST char *string; /* UTF-8 string to convert to Unicode. */ |
|
|
int length; /* Length of UTF-8 string in bytes, or -1 |
|
|
* for strlen(). */ |
|
|
Tcl_DString *dsPtr; /* Unicode representation of string is |
|
|
* appended to this previously initialized |
|
|
* DString. */ |
|
|
{ |
|
|
Tcl_UniChar *w, *wString; |
|
|
CONST char *p, *end; |
|
|
int oldLength; |
|
|
|
|
|
if (length < 0) { |
|
|
length = strlen(string); |
|
|
} |
|
|
|
|
|
/* |
|
|
* Unicode string length in Tcl_UniChars will be <= UTF-8 string length |
|
|
* in bytes. |
|
|
*/ |
|
|
|
|
|
oldLength = Tcl_DStringLength(dsPtr); |
|
|
Tcl_DStringSetLength(dsPtr, |
|
|
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); |
|
|
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); |
|
|
|
|
|
w = wString; |
|
|
end = string + length; |
|
|
for (p = string; p < end; ) { |
|
|
p += Tcl_UtfToUniChar(p, w); |
|
|
w++; |
|
|
} |
|
|
*w = '\0'; |
|
|
Tcl_DStringSetLength(dsPtr, |
|
|
(oldLength + ((char *) w - (char *) wString))); |
|
|
|
|
|
return wString; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfCharComplete -- |
|
|
* |
|
|
* Determine if the UTF-8 string of the given length is long enough |
|
|
* to be decoded by Tcl_UtfToUniChar(). This does not ensure that the |
|
|
* UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). |
|
|
* |
|
|
* Results: |
|
|
* The return value is 0 if the string is not long enough, non-zero |
|
|
* otherwise. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfCharComplete(str, len) |
|
|
CONST char *str; /* String to check if first few bytes |
|
|
* contain a complete UTF-8 character. */ |
|
|
int len; /* Length of above string in bytes. */ |
|
|
{ |
|
|
int ch; |
|
|
|
|
|
ch = *((unsigned char *) str); |
|
|
return len >= totalBytes[ch]; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_NumUtfChars -- |
|
|
* |
|
|
* Returns the number of characters (not bytes) in the UTF-8 string, |
|
|
* not including the terminating NULL byte. This is equivalent to |
|
|
* Plan 9 utflen() and utfnlen(). |
|
|
* |
|
|
* Results: |
|
|
* As above. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_NumUtfChars(str, len) |
|
|
register CONST char *str; /* The UTF-8 string to measure. */ |
|
|
int len; /* The length of the string in bytes, or -1 |
|
|
* for strlen(string). */ |
|
|
{ |
|
|
Tcl_UniChar ch; |
|
|
register Tcl_UniChar *chPtr = &ch; |
|
|
register int n; |
|
|
int i; |
|
|
|
|
|
/* |
|
|
* The separate implementations are faster. |
|
|
*/ |
|
|
|
|
|
i = 0; |
|
|
if (len < 0) { |
|
|
while (1) { |
|
|
str += Tcl_UtfToUniChar(str, chPtr); |
|
|
if (ch == '\0') { |
|
|
break; |
|
|
} |
|
|
i++; |
|
|
} |
|
|
} else { |
|
|
while (len > 0) { |
|
|
n = Tcl_UtfToUniChar(str, chPtr); |
|
|
len -= n; |
|
|
str += n; |
|
|
i++; |
|
|
} |
|
|
} |
|
|
return i; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfFindFirst -- |
|
|
* |
|
|
* Returns a pointer to the first occurance of the given Tcl_UniChar |
|
|
* in the NULL-terminated UTF-8 string. The NULL terminator is |
|
|
* considered part of the UTF-8 string. Equivalent to Plan 9 |
|
|
* utfrune(). |
|
|
* |
|
|
* Results: |
|
|
* As above. If the Tcl_UniChar does not exist in the given string, |
|
|
* the return value is NULL. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
char * |
|
|
Tcl_UtfFindFirst(string, ch) |
|
|
CONST char *string; /* The UTF-8 string to be searched. */ |
|
|
int ch; /* The Tcl_UniChar to search for. */ |
|
|
{ |
|
|
int len; |
|
|
Tcl_UniChar find; |
|
|
|
|
|
while (1) { |
|
|
len = Tcl_UtfToUniChar(string, &find); |
|
|
if (find == ch) { |
|
|
return (char *) string; |
|
|
} |
|
|
if (*string == '\0') { |
|
|
return NULL; |
|
|
} |
|
|
string += len; |
|
|
} |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfFindLast -- |
|
|
* |
|
|
* Returns a pointer to the last occurance of the given Tcl_UniChar |
|
|
* in the NULL-terminated UTF-8 string. The NULL terminator is |
|
|
* considered part of the UTF-8 string. Equivalent to Plan 9 |
|
|
* utfrrune(). |
|
|
* |
|
|
* Results: |
|
|
* As above. If the Tcl_UniChar does not exist in the given string, |
|
|
* the return value is NULL. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
char * |
|
|
Tcl_UtfFindLast(string, ch) |
|
|
CONST char *string; /* The UTF-8 string to be searched. */ |
|
|
int ch; /* The Tcl_UniChar to search for. */ |
|
|
{ |
|
|
int len; |
|
|
Tcl_UniChar find; |
|
|
CONST char *last; |
|
|
|
|
|
last = NULL; |
|
|
while (1) { |
|
|
len = Tcl_UtfToUniChar(string, &find); |
|
|
if (find == ch) { |
|
|
last = string; |
|
|
} |
|
|
if (*string == '\0') { |
|
|
break; |
|
|
} |
|
|
string += len; |
|
|
} |
|
|
return (char *) last; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfNext -- |
|
|
* |
|
|
* Given a pointer to some current location in a UTF-8 string, |
|
|
* move forward one character. The caller must ensure that they |
|
|
* are not asking for the next character after the last character |
|
|
* in the string. |
|
|
* |
|
|
* Results: |
|
|
* The return value is the pointer to the next character in |
|
|
* the UTF-8 string. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
char * |
|
|
Tcl_UtfNext(str) |
|
|
CONST char *str; /* The current location in the string. */ |
|
|
{ |
|
|
Tcl_UniChar ch; |
|
|
|
|
|
return (char *) str + Tcl_UtfToUniChar(str, &ch); |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfPrev -- |
|
|
* |
|
|
* Given a pointer to some current location in a UTF-8 string, |
|
|
* move backwards one character. |
|
|
* |
|
|
* Results: |
|
|
* The return value is a pointer to the previous character in the |
|
|
* UTF-8 string. If the current location was already at the |
|
|
* beginning of the string, the return value will also be a |
|
|
* pointer to the beginning of the string. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
char * |
|
|
Tcl_UtfPrev(str, start) |
|
|
CONST char *str; /* The current location in the string. */ |
|
|
CONST char *start; /* Pointer to the beginning of the |
|
|
* string, to avoid going backwards too |
|
|
* far. */ |
|
|
{ |
|
|
CONST char *look; |
|
|
int i, byte; |
|
|
|
|
|
str--; |
|
|
look = str; |
|
|
for (i = 0; i < TCL_UTF_MAX; i++) { |
|
|
if (look < start) { |
|
|
if (str < start) { |
|
|
str = start; |
|
|
} |
|
|
break; |
|
|
} |
|
|
byte = *((unsigned char *) look); |
|
|
if (byte < 0x80) { |
|
|
break; |
|
|
} |
|
|
if (byte >= 0xC0) { |
|
|
if (totalBytes[byte] != i + 1) { |
|
|
break; |
|
|
} |
|
|
return (char *) look; |
|
|
} |
|
|
look--; |
|
|
} |
|
|
return (char *) str; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharAtIndex -- |
|
|
* |
|
|
* Returns the Unicode character represented at the specified |
|
|
* character (not byte) position in the UTF-8 string. |
|
|
* |
|
|
* Results: |
|
|
* As above. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
Tcl_UniChar |
|
|
Tcl_UniCharAtIndex(src, index) |
|
|
register CONST char *src; /* The UTF-8 string to dereference. */ |
|
|
register int index; /* The position of the desired character. */ |
|
|
{ |
|
|
Tcl_UniChar ch; |
|
|
|
|
|
while (index >= 0) { |
|
|
index--; |
|
|
src += Tcl_UtfToUniChar(src, &ch); |
|
|
} |
|
|
return ch; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfAtIndex -- |
|
|
* |
|
|
* Returns a pointer to the specified character (not byte) position |
|
|
* in the UTF-8 string. |
|
|
* |
|
|
* Results: |
|
|
* As above. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
char * |
|
|
Tcl_UtfAtIndex(src, index) |
|
|
register CONST char *src; /* The UTF-8 string. */ |
|
|
register int index; /* The position of the desired character. */ |
|
|
{ |
|
|
Tcl_UniChar ch; |
|
|
|
|
|
while (index > 0) { |
|
|
index--; |
|
|
src += Tcl_UtfToUniChar(src, &ch); |
|
|
} |
|
|
return (char *) src; |
|
|
} |
|
|
|
|
|
/* |
|
|
*--------------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfBackslash -- |
|
|
* |
|
|
* Figure out how to handle a backslash sequence. |
|
|
* |
|
|
* Results: |
|
|
* Stores the bytes represented by the backslash sequence in dst and |
|
|
* returns the number of bytes written to dst. At most TCL_UTF_MAX |
|
|
* bytes are written to dst; dst must have been large enough to accept |
|
|
* those bytes. If readPtr isn't NULL then it is filled in with a |
|
|
* count of the number of bytes in the backslash sequence. |
|
|
* |
|
|
* Side effects: |
|
|
* The maximum number of bytes it takes to represent a Unicode |
|
|
* character in UTF-8 is guaranteed to be less than the number of |
|
|
* bytes used to express the backslash sequence that represents |
|
|
* that Unicode character. If the target buffer into which the |
|
|
* caller is going to store the bytes that represent the Unicode |
|
|
* character is at least as large as the source buffer from which |
|
|
* the backslashed sequence was extracted, no buffer overruns should |
|
|
* occur. |
|
|
* |
|
|
*--------------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfBackslash(src, readPtr, dst) |
|
|
CONST char *src; /* Points to the backslash character of |
|
|
* a backslash sequence. */ |
|
|
int *readPtr; /* Fill in with number of characters read |
|
|
* from src, unless NULL. */ |
|
|
char *dst; /* Filled with the bytes represented by the |
|
|
* backslash sequence. */ |
|
|
{ |
|
|
register CONST char *p = src+1; |
|
|
int result, count, n; |
|
|
char buf[TCL_UTF_MAX]; |
|
|
|
|
|
if (dst == NULL) { |
|
|
dst = buf; |
|
|
} |
|
|
|
|
|
count = 2; |
|
|
switch (*p) { |
|
|
/* |
|
|
* Note: in the conversions below, use absolute values (e.g., |
|
|
* 0xa) rather than symbolic values (e.g. \n) that get converted |
|
|
* by the compiler. It's possible that compilers on some |
|
|
* platforms will do the symbolic conversions differently, which |
|
|
* could result in non-portable Tcl scripts. |
|
|
*/ |
|
|
|
|
|
case 'a': |
|
|
result = 0x7; |
|
|
break; |
|
|
case 'b': |
|
|
result = 0x8; |
|
|
break; |
|
|
case 'f': |
|
|
result = 0xc; |
|
|
break; |
|
|
case 'n': |
|
|
result = 0xa; |
|
|
break; |
|
|
case 'r': |
|
|
result = 0xd; |
|
|
break; |
|
|
case 't': |
|
|
result = 0x9; |
|
|
break; |
|
|
case 'v': |
|
|
result = 0xb; |
|
|
break; |
|
|
case 'x': |
|
|
if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ |
|
|
char *end; |
|
|
|
|
|
result = (unsigned char) strtoul(p+1, &end, 16); |
|
|
count = end - src; |
|
|
} else { |
|
|
count = 2; |
|
|
result = 'x'; |
|
|
} |
|
|
break; |
|
|
case 'u': |
|
|
result = 0; |
|
|
for (count = 0; count < 4; count++) { |
|
|
p++; |
|
|
if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ |
|
|
break; |
|
|
} |
|
|
n = *p - '0'; |
|
|
if (n > 9) { |
|
|
n = n + '0' + 10 - 'A'; |
|
|
} |
|
|
if (n > 16) { |
|
|
n = n + 'A' - 'a'; |
|
|
} |
|
|
result = (result << 4) + n; |
|
|
} |
|
|
if (count == 0) { |
|
|
result = 'u'; |
|
|
} |
|
|
count += 2; |
|
|
break; |
|
|
|
|
|
case '\n': |
|
|
do { |
|
|
p++; |
|
|
} while ((*p == ' ') || (*p == '\t')); |
|
|
result = ' '; |
|
|
count = p - src; |
|
|
break; |
|
|
case 0: |
|
|
result = '\\'; |
|
|
count = 1; |
|
|
break; |
|
|
default: |
|
|
/* |
|
|
* Check for an octal number \oo?o? |
|
|
*/ |
|
|
if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */ |
|
|
result = (unsigned char)(*p - '0'); |
|
|
p++; |
|
|
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
|
|
break; |
|
|
} |
|
|
count = 3; |
|
|
result = (unsigned char)((result << 3) + (*p - '0')); |
|
|
p++; |
|
|
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
|
|
break; |
|
|
} |
|
|
count = 4; |
|
|
result = (unsigned char)((result << 3) + (*p - '0')); |
|
|
break; |
|
|
} |
|
|
result = *p; |
|
|
count = 2; |
|
|
break; |
|
|
} |
|
|
|
|
|
if (readPtr != NULL) { |
|
|
*readPtr = count; |
|
|
} |
|
|
return Tcl_UniCharToUtf(result, dst); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfToUpper -- |
|
|
* |
|
|
* Convert lowercase characters to uppercase characters in a UTF |
|
|
* string in place. The conversion may shrink the UTF string. |
|
|
* |
|
|
* Results: |
|
|
* Returns the number of bytes in the resulting string |
|
|
* excluding the trailing null. |
|
|
* |
|
|
* Side effects: |
|
|
* Writes a terminating null after the last converted character. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfToUpper(str) |
|
|
char *str; /* String to convert in place. */ |
|
|
{ |
|
|
Tcl_UniChar ch, upChar; |
|
|
char *src, *dst; |
|
|
int bytes; |
|
|
|
|
|
/* |
|
|
* Iterate over the string until we hit the terminating null. |
|
|
*/ |
|
|
|
|
|
src = dst = str; |
|
|
while (*src) { |
|
|
bytes = Tcl_UtfToUniChar(src, &ch); |
|
|
upChar = Tcl_UniCharToUpper(ch); |
|
|
|
|
|
/* |
|
|
* To keep badly formed Utf strings from getting inflated by |
|
|
* the conversion (thereby causing a segfault), only copy the |
|
|
* upper case char to dst if its size is <= the original char. |
|
|
*/ |
|
|
|
|
|
if (bytes < UtfCount(upChar)) { |
|
|
memcpy(dst, src, (size_t) bytes); |
|
|
dst += bytes; |
|
|
} else { |
|
|
dst += Tcl_UniCharToUtf(upChar, dst); |
|
|
} |
|
|
src += bytes; |
|
|
} |
|
|
*dst = '\0'; |
|
|
return (dst - str); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfToLower -- |
|
|
* |
|
|
* Convert uppercase characters to lowercase characters in a UTF |
|
|
* string in place. The conversion may shrink the UTF string. |
|
|
* |
|
|
* Results: |
|
|
* Returns the number of bytes in the resulting string |
|
|
* excluding the trailing null. |
|
|
* |
|
|
* Side effects: |
|
|
* Writes a terminating null after the last converted character. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfToLower(str) |
|
|
char *str; /* String to convert in place. */ |
|
|
{ |
|
|
Tcl_UniChar ch, lowChar; |
|
|
char *src, *dst; |
|
|
int bytes; |
|
|
|
|
|
/* |
|
|
* Iterate over the string until we hit the terminating null. |
|
|
*/ |
|
|
|
|
|
src = dst = str; |
|
|
while (*src) { |
|
|
bytes = Tcl_UtfToUniChar(src, &ch); |
|
|
lowChar = Tcl_UniCharToLower(ch); |
|
|
|
|
|
/* |
|
|
* To keep badly formed Utf strings from getting inflated by |
|
|
* the conversion (thereby causing a segfault), only copy the |
|
|
* lower case char to dst if its size is <= the original char. |
|
|
*/ |
|
|
|
|
|
if (bytes < UtfCount(lowChar)) { |
|
|
memcpy(dst, src, (size_t) bytes); |
|
|
dst += bytes; |
|
|
} else { |
|
|
dst += Tcl_UniCharToUtf(lowChar, dst); |
|
|
} |
|
|
src += bytes; |
|
|
} |
|
|
*dst = '\0'; |
|
|
return (dst - str); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfToTitle -- |
|
|
* |
|
|
* Changes the first character of a UTF string to title case or |
|
|
* uppercase and the rest of the string to lowercase. The |
|
|
* conversion happens in place and may shrink the UTF string. |
|
|
* |
|
|
* Results: |
|
|
* Returns the number of bytes in the resulting string |
|
|
* excluding the trailing null. |
|
|
* |
|
|
* Side effects: |
|
|
* Writes a terminating null after the last converted character. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfToTitle(str) |
|
|
char *str; /* String to convert in place. */ |
|
|
{ |
|
|
Tcl_UniChar ch, titleChar, lowChar; |
|
|
char *src, *dst; |
|
|
int bytes; |
|
|
|
|
|
/* |
|
|
* Capitalize the first character and then lowercase the rest of the |
|
|
* characters until we get to a null. |
|
|
*/ |
|
|
|
|
|
src = dst = str; |
|
|
|
|
|
if (*src) { |
|
|
bytes = Tcl_UtfToUniChar(src, &ch); |
|
|
titleChar = Tcl_UniCharToTitle(ch); |
|
|
|
|
|
if (bytes < UtfCount(titleChar)) { |
|
|
memcpy(dst, src, (size_t) bytes); |
|
|
dst += bytes; |
|
|
} else { |
|
|
dst += Tcl_UniCharToUtf(titleChar, dst); |
|
|
} |
|
|
src += bytes; |
|
|
} |
|
|
while (*src) { |
|
|
bytes = Tcl_UtfToUniChar(src, &ch); |
|
|
lowChar = Tcl_UniCharToLower(ch); |
|
|
|
|
|
if (bytes < UtfCount(lowChar)) { |
|
|
memcpy(dst, src, (size_t) bytes); |
|
|
dst += bytes; |
|
|
} else { |
|
|
dst += Tcl_UniCharToUtf(lowChar, dst); |
|
|
} |
|
|
src += bytes; |
|
|
} |
|
|
*dst = '\0'; |
|
|
return (dst - str); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfNcmp -- |
|
|
* |
|
|
* Compare at most n UTF chars of string cs to string ct. Both cs |
|
|
* and ct are assumed to be at least n UTF chars long. |
|
|
* |
|
|
* Results: |
|
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfNcmp(cs, ct, n) |
|
|
CONST char *cs; /* UTF string to compare to ct. */ |
|
|
CONST char *ct; /* UTF string cs is compared to. */ |
|
|
unsigned long n; /* Number of UTF chars to compare. */ |
|
|
{ |
|
|
Tcl_UniChar ch1, ch2; |
|
|
/* |
|
|
* Another approach that should work is: |
|
|
* return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs)); |
|
|
* That assumes that ct is a properly formed UTF, so we will just |
|
|
* be comparing the bytes that compromise those strings to the |
|
|
* char length n. |
|
|
*/ |
|
|
while (n-- > 0) { |
|
|
/* |
|
|
* n must be interpreted as chars, not bytes. |
|
|
* This should be called only when both strings are of |
|
|
* at least n chars long (no need for \0 check) |
|
|
*/ |
|
|
cs += Tcl_UtfToUniChar(cs, &ch1); |
|
|
ct += Tcl_UtfToUniChar(ct, &ch2); |
|
|
if (ch1 != ch2) { |
|
|
return (ch1 - ch2); |
|
|
} |
|
|
} |
|
|
return 0; |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UtfNcasecmp -- |
|
|
* |
|
|
* Compare at most n UTF chars of string cs to string ct case |
|
|
* insensitive. Both cs and ct are assumed to be at least n |
|
|
* UTF chars long. |
|
|
* |
|
|
* Results: |
|
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UtfNcasecmp(cs, ct, n) |
|
|
CONST char *cs; /* UTF string to compare to ct. */ |
|
|
CONST char *ct; /* UTF string cs is compared to. */ |
|
|
unsigned long n; /* Number of UTF chars to compare. */ |
|
|
{ |
|
|
Tcl_UniChar ch1, ch2; |
|
|
while (n-- > 0) { |
|
|
/* |
|
|
* n must be interpreted as chars, not bytes. |
|
|
* This should be called only when both strings are of |
|
|
* at least n chars long (no need for \0 check) |
|
|
*/ |
|
|
cs += Tcl_UtfToUniChar(cs, &ch1); |
|
|
ct += Tcl_UtfToUniChar(ct, &ch2); |
|
|
if (ch1 != ch2) { |
|
|
ch1 = Tcl_UniCharToLower(ch1); |
|
|
ch2 = Tcl_UniCharToLower(ch2); |
|
|
if (ch1 != ch2) { |
|
|
return (ch1 - ch2); |
|
|
} |
|
|
} |
|
|
} |
|
|
return 0; |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharToUpper -- |
|
|
* |
|
|
* Compute the uppercase equivalent of the given Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns the uppercase Unicode character. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
Tcl_UniChar |
|
|
Tcl_UniCharToUpper(ch) |
|
|
int ch; /* Unicode character to convert. */ |
|
|
{ |
|
|
int info = GetUniCharInfo(ch); |
|
|
|
|
|
if (GetCaseType(info) & 0x04) { |
|
|
return (Tcl_UniChar) (ch - GetDelta(info)); |
|
|
} else { |
|
|
return ch; |
|
|
} |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharToLower -- |
|
|
* |
|
|
* Compute the lowercase equivalent of the given Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns the lowercase Unicode character. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
Tcl_UniChar |
|
|
Tcl_UniCharToLower(ch) |
|
|
int ch; /* Unicode character to convert. */ |
|
|
{ |
|
|
int info = GetUniCharInfo(ch); |
|
|
|
|
|
if (GetCaseType(info) & 0x02) { |
|
|
return (Tcl_UniChar) (ch + GetDelta(info)); |
|
|
} else { |
|
|
return ch; |
|
|
} |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharToTitle -- |
|
|
* |
|
|
* Compute the titlecase equivalent of the given Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns the titlecase Unicode character. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
Tcl_UniChar |
|
|
Tcl_UniCharToTitle(ch) |
|
|
int ch; /* Unicode character to convert. */ |
|
|
{ |
|
|
int info = GetUniCharInfo(ch); |
|
|
int mode = GetCaseType(info); |
|
|
|
|
|
if (mode & 0x1) { |
|
|
/* |
|
|
* Subtract or add one depending on the original case. |
|
|
*/ |
|
|
|
|
|
return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); |
|
|
} else if (mode == 0x4) { |
|
|
return (Tcl_UniChar) (ch - GetDelta(info)); |
|
|
} else { |
|
|
return ch; |
|
|
} |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharLen -- |
|
|
* |
|
|
* Find the length of a UniChar string. The str input must be null |
|
|
* terminated. |
|
|
* |
|
|
* Results: |
|
|
* Returns the length of str in UniChars (not bytes). |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharLen(str) |
|
|
Tcl_UniChar *str; /* Unicode string to find length of. */ |
|
|
{ |
|
|
int len = 0; |
|
|
|
|
|
while (*str != '\0') { |
|
|
len++; |
|
|
str++; |
|
|
} |
|
|
return len; |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharNcmp -- |
|
|
* |
|
|
* Compare at most n unichars of string cs to string ct. Both cs |
|
|
* and ct are assumed to be at least n unichars long. |
|
|
* |
|
|
* Results: |
|
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharNcmp(cs, ct, n) |
|
|
CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ |
|
|
CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ |
|
|
unsigned long n; /* Number of unichars to compare. */ |
|
|
{ |
|
|
for ( ; n != 0; n--, cs++, ct++) { |
|
|
if (*cs != *ct) { |
|
|
return *cs - *ct; |
|
|
} |
|
|
if (*cs == '\0') { |
|
|
break; |
|
|
} |
|
|
} |
|
|
return 0; |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsAlnum -- |
|
|
* |
|
|
* Test if a character is an alphanumeric Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns 1 if character is alphanumeric. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsAlnum(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
|
|
|
return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsAlpha -- |
|
|
* |
|
|
* Test if a character is an alphabetic Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns 1 if character is alphabetic. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsAlpha(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
return ((ALPHA_BITS >> category) & 1); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsControl -- |
|
|
* |
|
|
* Test if a character is a Unicode control character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is a control. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsControl(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsDigit -- |
|
|
* |
|
|
* Test if a character is a numeric Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is a digit. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsDigit(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) |
|
|
== DECIMAL_DIGIT_NUMBER); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsGraph -- |
|
|
* |
|
|
* Test if a character is any Unicode print character except space. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is printable, but not space. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsGraph(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsLower -- |
|
|
* |
|
|
* Test if a character is a lowercase Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is lowercase. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsLower(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsPrint -- |
|
|
* |
|
|
* Test if a character is a Unicode print character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is printable. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsPrint(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
return ((PRINT_BITS >> category) & 1); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsPunct -- |
|
|
* |
|
|
* Test if a character is a Unicode punctuation character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is punct. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsPunct(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
return ((PUNCT_BITS >> category) & 1); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsSpace -- |
|
|
* |
|
|
* Test if a character is a whitespace Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is a space. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsSpace(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category; |
|
|
|
|
|
/* |
|
|
* If the character is within the first 127 characters, just use the |
|
|
* standard C function, otherwise consult the Unicode table. |
|
|
*/ |
|
|
|
|
|
if (ch < 0x80) { |
|
|
return isspace(UCHAR(ch)); /* INTL: ISO space */ |
|
|
} else { |
|
|
category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
return ((SPACE_BITS >> category) & 1); |
|
|
} |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsUpper -- |
|
|
* |
|
|
* Test if a character is a uppercase Unicode character. |
|
|
* |
|
|
* Results: |
|
|
* Returns non-zero if character is uppercase. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsUpper(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); |
|
|
} |
|
|
|
|
|
/* |
|
|
*---------------------------------------------------------------------- |
|
|
* |
|
|
* Tcl_UniCharIsWordChar -- |
|
|
* |
|
|
* Test if a character is alphanumeric or a connector punctuation |
|
|
* mark. |
|
|
* |
|
|
* Results: |
|
|
* Returns 1 if character is a word character. |
|
|
* |
|
|
* Side effects: |
|
|
* None. |
|
|
* |
|
|
*---------------------------------------------------------------------- |
|
|
*/ |
|
|
|
|
|
int |
|
|
Tcl_UniCharIsWordChar(ch) |
|
|
int ch; /* Unicode character to test. */ |
|
|
{ |
|
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
|
|
|
|
|
return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); |
|
|
} |
|
|
|
|
|
|
|
|
/* $History: tclutf.c $
 *
 * *****************  Version 1  *****************
 * User: Dtashley     Date: 1/02/01    Time: 1:05a
 * Created in $/IjuScripter, IjuConsole/Source/Tcl Base
 * Initial check-in.
 */

/* End of TCL_UTF.C */
|
1 |
|
/* $Header$ */ |
2 |
|
/* |
3 |
|
* tclUtf.c -- |
4 |
|
* |
5 |
|
* Routines for manipulating UTF-8 strings. |
6 |
|
* |
7 |
|
* Copyright (c) 1997-1998 Sun Microsystems, Inc. |
8 |
|
* |
9 |
|
* See the file "license.terms" for information on usage and redistribution |
10 |
|
* of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
11 |
|
* |
12 |
|
* RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ |
13 |
|
*/ |
14 |
|
|
15 |
|
#include "tclInt.h" |
16 |
|
|
17 |
|
/* |
18 |
|
* Include the static character classification tables and macros. |
19 |
|
*/ |
20 |
|
|
21 |
|
#include "tclUniData.c" |
22 |
|
|
/*
 * Category sets used for fast character classification.  Each x_BITS
 * value has one bit set for every Unicode category in the class; a test
 * shifts the set right by a character's category value and inspects the
 * low bit.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) | \
        (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | \
        (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) | \
        (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

#define PUNCT_BITS (CONNECTOR_BITS | \
        (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, marks, the other
 * number categories, all punctuation and all symbols.
 */

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
        (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
        (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
        (1 << OTHER_NUMBER) | \
        (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
        (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * Unicode characters less than this value are represented by themselves
 * in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
60 |
|
|
61 |
|
/* |
62 |
|
* The following structures are used when mapping between Unicode (UCS-2) |
63 |
|
* and UTF-8. |
64 |
|
*/ |
65 |
|
|
66 |
|
CONST unsigned char totalBytes[256] = { |
67 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
68 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
69 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
70 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
71 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
72 |
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
73 |
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
74 |
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, |
75 |
|
#if TCL_UTF_MAX > 3 |
76 |
|
4,4,4,4,4,4,4,4, |
77 |
|
#else |
78 |
|
1,1,1,1,1,1,1,1, |
79 |
|
#endif |
80 |
|
#if TCL_UTF_MAX > 4 |
81 |
|
5,5,5,5, |
82 |
|
#else |
83 |
|
1,1,1,1, |
84 |
|
#endif |
85 |
|
#if TCL_UTF_MAX > 5 |
86 |
|
6,6,6,6 |
87 |
|
#else |
88 |
|
1,1,1,1 |
89 |
|
#endif |
90 |
|
}; |
91 |
|
|
92 |
|
/* |
93 |
|
* Procedures used only in this module. |
94 |
|
*/ |
95 |
|
|
96 |
|
static int UtfCount _ANSI_ARGS_((int ch)); |
97 |
|
|
98 |
|
|
99 |
|
/* |
100 |
|
*--------------------------------------------------------------------------- |
101 |
|
* |
102 |
|
* UtfCount -- |
103 |
|
* |
104 |
|
* Find the number of bytes in the Utf character "ch". |
105 |
|
* |
106 |
|
* Results: |
107 |
|
* The return values is the number of bytes in the Utf character "ch". |
108 |
|
* |
109 |
|
* Side effects: |
110 |
|
* None. |
111 |
|
* |
112 |
|
*--------------------------------------------------------------------------- |
113 |
|
*/ |
114 |
|
|
115 |
|
static int |
116 |
|
UtfCount(ch) |
117 |
|
int ch; /* The Tcl_UniChar whose size is returned. */ |
118 |
|
{ |
119 |
|
if ((ch > 0) && (ch < UNICODE_SELF)) { |
120 |
|
return 1; |
121 |
|
} |
122 |
|
if (ch <= 0x7FF) { |
123 |
|
return 2; |
124 |
|
} |
125 |
|
if (ch <= 0xFFFF) { |
126 |
|
return 3; |
127 |
|
} |
128 |
|
#if TCL_UTF_MAX > 3 |
129 |
|
if (ch <= 0x1FFFFF) { |
130 |
|
return 4; |
131 |
|
} |
132 |
|
if (ch <= 0x3FFFFFF) { |
133 |
|
return 5; |
134 |
|
} |
135 |
|
if (ch <= 0x7FFFFFFF) { |
136 |
|
return 6; |
137 |
|
} |
138 |
|
#endif |
139 |
|
return 3; |
140 |
|
} |
141 |
|
|
142 |
|
/* |
143 |
|
*--------------------------------------------------------------------------- |
144 |
|
* |
145 |
|
* Tcl_UniCharToUtf -- |
146 |
|
* |
147 |
|
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the |
148 |
|
* provided buffer. Equivalent to Plan 9 runetochar(). |
149 |
|
* |
150 |
|
* Results: |
151 |
|
* The return values is the number of bytes in the buffer that |
152 |
|
* were consumed. |
153 |
|
* |
154 |
|
* Side effects: |
155 |
|
* None. |
156 |
|
* |
157 |
|
*--------------------------------------------------------------------------- |
158 |
|
*/ |
159 |
|
|
160 |
|
INLINE int |
161 |
|
Tcl_UniCharToUtf(ch, str) |
162 |
|
int ch; /* The Tcl_UniChar to be stored in the |
163 |
|
* buffer. */ |
164 |
|
char *str; /* Buffer in which the UTF-8 representation |
165 |
|
* of the Tcl_UniChar is stored. Buffer must |
166 |
|
* be large enough to hold the UTF-8 character |
167 |
|
* (at most TCL_UTF_MAX bytes). */ |
168 |
|
{ |
169 |
|
if ((ch > 0) && (ch < UNICODE_SELF)) { |
170 |
|
str[0] = (char) ch; |
171 |
|
return 1; |
172 |
|
} |
173 |
|
if (ch <= 0x7FF) { |
174 |
|
str[1] = (char) ((ch | 0x80) & 0xBF); |
175 |
|
str[0] = (char) ((ch >> 6) | 0xC0); |
176 |
|
return 2; |
177 |
|
} |
178 |
|
if (ch <= 0xFFFF) { |
179 |
|
three: |
180 |
|
str[2] = (char) ((ch | 0x80) & 0xBF); |
181 |
|
str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); |
182 |
|
str[0] = (char) ((ch >> 12) | 0xE0); |
183 |
|
return 3; |
184 |
|
} |
185 |
|
|
186 |
|
#if TCL_UTF_MAX > 3 |
187 |
|
if (ch <= 0x1FFFFF) { |
188 |
|
str[3] = (char) ((ch | 0x80) & 0xBF); |
189 |
|
str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); |
190 |
|
str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); |
191 |
|
str[0] = (char) ((ch >> 18) | 0xF0); |
192 |
|
return 4; |
193 |
|
} |
194 |
|
if (ch <= 0x3FFFFFF) { |
195 |
|
str[4] = (char) ((ch | 0x80) & 0xBF); |
196 |
|
str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); |
197 |
|
str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); |
198 |
|
str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); |
199 |
|
str[0] = (char) ((ch >> 24) | 0xF8); |
200 |
|
return 5; |
201 |
|
} |
202 |
|
if (ch <= 0x7FFFFFFF) { |
203 |
|
str[5] = (char) ((ch | 0x80) & 0xBF); |
204 |
|
str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); |
205 |
|
str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); |
206 |
|
str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); |
207 |
|
str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); |
208 |
|
str[0] = (char) ((ch >> 30) | 0xFC); |
209 |
|
return 6; |
210 |
|
} |
211 |
|
#endif |
212 |
|
|
213 |
|
ch = 0xFFFD; |
214 |
|
goto three; |
215 |
|
} |
216 |
|
|
217 |
|
/* |
218 |
|
*--------------------------------------------------------------------------- |
219 |
|
* |
220 |
|
* Tcl_UniCharToUtfDString -- |
221 |
|
* |
222 |
|
* Convert the given Unicode string to UTF-8. |
223 |
|
* |
224 |
|
* Results: |
225 |
|
* The return value is a pointer to the UTF-8 representation of the |
226 |
|
* Unicode string. Storage for the return value is appended to the |
227 |
|
* end of dsPtr. |
228 |
|
* |
229 |
|
* Side effects: |
230 |
|
* None. |
231 |
|
* |
232 |
|
*--------------------------------------------------------------------------- |
233 |
|
*/ |
234 |
|
|
235 |
|
char * |
236 |
|
Tcl_UniCharToUtfDString(wString, numChars, dsPtr) |
237 |
|
CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ |
238 |
|
int numChars; /* Length of Unicode string in Tcl_UniChars |
239 |
|
* (must be >= 0). */ |
240 |
|
Tcl_DString *dsPtr; /* UTF-8 representation of string is |
241 |
|
* appended to this previously initialized |
242 |
|
* DString. */ |
243 |
|
{ |
244 |
|
CONST Tcl_UniChar *w, *wEnd; |
245 |
|
char *p, *string; |
246 |
|
int oldLength; |
247 |
|
|
248 |
|
/* |
249 |
|
* UTF-8 string length in bytes will be <= Unicode string length * |
250 |
|
* TCL_UTF_MAX. |
251 |
|
*/ |
252 |
|
|
253 |
|
oldLength = Tcl_DStringLength(dsPtr); |
254 |
|
Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); |
255 |
|
string = Tcl_DStringValue(dsPtr) + oldLength; |
256 |
|
|
257 |
|
p = string; |
258 |
|
wEnd = wString + numChars; |
259 |
|
for (w = wString; w < wEnd; ) { |
260 |
|
p += Tcl_UniCharToUtf(*w, p); |
261 |
|
w++; |
262 |
|
} |
263 |
|
Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); |
264 |
|
|
265 |
|
return string; |
266 |
|
} |
267 |
|
|
268 |
|
/* |
269 |
|
*--------------------------------------------------------------------------- |
270 |
|
* |
271 |
|
* Tcl_UtfToUniChar -- |
272 |
|
* |
273 |
|
* Extract the Tcl_UniChar represented by the UTF-8 string. Bad |
274 |
|
* UTF-8 sequences are converted to valid Tcl_UniChars and processing |
275 |
|
* continues. Equivalent to Plan 9 chartorune(). |
276 |
|
* |
277 |
|
* The caller must ensure that the source buffer is long enough that |
278 |
|
* this routine does not run off the end and dereference non-existent |
279 |
|
* memory looking for trail bytes. If the source buffer is known to |
280 |
|
* be '\0' terminated, this cannot happen. Otherwise, the caller |
281 |
|
* should call Tcl_UtfCharComplete() before calling this routine to |
282 |
|
* ensure that enough bytes remain in the string. |
283 |
|
* |
284 |
|
* Results: |
285 |
|
* *chPtr is filled with the Tcl_UniChar, and the return value is the |
286 |
|
* number of bytes from the UTF-8 string that were consumed. |
287 |
|
* |
288 |
|
* Side effects: |
289 |
|
* None. |
290 |
|
* |
291 |
|
*--------------------------------------------------------------------------- |
292 |
|
*/ |
293 |
|
|
294 |
|
int |
295 |
|
Tcl_UtfToUniChar(str, chPtr) |
296 |
|
register CONST char *str; /* The UTF-8 string. */ |
297 |
|
register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented |
298 |
|
* by the UTF-8 string. */ |
299 |
|
{ |
300 |
|
register int byte; |
301 |
|
|
302 |
|
/* |
303 |
|
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. |
304 |
|
*/ |
305 |
|
|
306 |
|
byte = *((unsigned char *) str); |
307 |
|
if (byte < 0xC0) { |
308 |
|
/* |
309 |
|
* Handles properly formed UTF-8 characters between 0x01 and 0x7F. |
310 |
|
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid |
311 |
|
* characters representing themselves. |
312 |
|
*/ |
313 |
|
|
314 |
|
*chPtr = (Tcl_UniChar) byte; |
315 |
|
return 1; |
316 |
|
} else if (byte < 0xE0) { |
317 |
|
if ((str[1] & 0xC0) == 0x80) { |
318 |
|
/* |
319 |
|
* Two-byte-character lead-byte followed by a trail-byte. |
320 |
|
*/ |
321 |
|
|
322 |
|
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); |
323 |
|
return 2; |
324 |
|
} |
325 |
|
/* |
326 |
|
* A two-byte-character lead-byte not followed by trail-byte |
327 |
|
* represents itself. |
328 |
|
*/ |
329 |
|
|
330 |
|
*chPtr = (Tcl_UniChar) byte; |
331 |
|
return 1; |
332 |
|
} else if (byte < 0xF0) { |
333 |
|
if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { |
334 |
|
/* |
335 |
|
* Three-byte-character lead byte followed by two trail bytes. |
336 |
|
*/ |
337 |
|
|
338 |
|
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) |
339 |
|
| ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); |
340 |
|
return 3; |
341 |
|
} |
342 |
|
/* |
343 |
|
* A three-byte-character lead-byte not followed by two trail-bytes |
344 |
|
* represents itself. |
345 |
|
*/ |
346 |
|
|
347 |
|
*chPtr = (Tcl_UniChar) byte; |
348 |
|
return 1; |
349 |
|
} |
350 |
|
#if TCL_UTF_MAX > 3 |
351 |
|
else { |
352 |
|
int ch, total, trail; |
353 |
|
|
354 |
|
total = totalBytes[byte]; |
355 |
|
trail = total - 1; |
356 |
|
if (trail > 0) { |
357 |
|
ch = byte & (0x3F >> trail); |
358 |
|
do { |
359 |
|
str++; |
360 |
|
if ((*str & 0xC0) != 0x80) { |
361 |
|
*chPtr = byte; |
362 |
|
return 1; |
363 |
|
} |
364 |
|
ch <<= 6; |
365 |
|
ch |= (*str & 0x3F); |
366 |
|
trail--; |
367 |
|
} while (trail > 0); |
368 |
|
*chPtr = ch; |
369 |
|
return total; |
370 |
|
} |
371 |
|
} |
372 |
|
#endif |
373 |
|
|
374 |
|
*chPtr = (Tcl_UniChar) byte; |
375 |
|
return 1; |
376 |
|
} |
377 |
|
|
378 |
|
/* |
379 |
|
*--------------------------------------------------------------------------- |
380 |
|
* |
381 |
|
* Tcl_UtfToUniCharDString -- |
382 |
|
* |
383 |
|
* Convert the UTF-8 string to Unicode. |
384 |
|
* |
385 |
|
* Results: |
386 |
|
* The return value is a pointer to the Unicode representation of the |
387 |
|
* UTF-8 string. Storage for the return value is appended to the |
388 |
|
* end of dsPtr. The Unicode string is terminated with a Unicode |
389 |
|
* NULL character. |
390 |
|
* |
391 |
|
* Side effects: |
392 |
|
* None. |
393 |
|
* |
394 |
|
*--------------------------------------------------------------------------- |
395 |
|
*/ |
396 |
|
|
397 |
|
Tcl_UniChar * |
398 |
|
Tcl_UtfToUniCharDString(string, length, dsPtr) |
399 |
|
CONST char *string; /* UTF-8 string to convert to Unicode. */ |
400 |
|
int length; /* Length of UTF-8 string in bytes, or -1 |
401 |
|
* for strlen(). */ |
402 |
|
Tcl_DString *dsPtr; /* Unicode representation of string is |
403 |
|
* appended to this previously initialized |
404 |
|
* DString. */ |
405 |
|
{ |
406 |
|
Tcl_UniChar *w, *wString; |
407 |
|
CONST char *p, *end; |
408 |
|
int oldLength; |
409 |
|
|
410 |
|
if (length < 0) { |
411 |
|
length = strlen(string); |
412 |
|
} |
413 |
|
|
414 |
|
/* |
415 |
|
* Unicode string length in Tcl_UniChars will be <= UTF-8 string length |
416 |
|
* in bytes. |
417 |
|
*/ |
418 |
|
|
419 |
|
oldLength = Tcl_DStringLength(dsPtr); |
420 |
|
Tcl_DStringSetLength(dsPtr, |
421 |
|
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); |
422 |
|
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); |
423 |
|
|
424 |
|
w = wString; |
425 |
|
end = string + length; |
426 |
|
for (p = string; p < end; ) { |
427 |
|
p += Tcl_UtfToUniChar(p, w); |
428 |
|
w++; |
429 |
|
} |
430 |
|
*w = '\0'; |
431 |
|
Tcl_DStringSetLength(dsPtr, |
432 |
|
(oldLength + ((char *) w - (char *) wString))); |
433 |
|
|
434 |
|
return wString; |
435 |
|
} |
436 |
|
|
437 |
|
/* |
438 |
|
*--------------------------------------------------------------------------- |
439 |
|
* |
440 |
|
* Tcl_UtfCharComplete -- |
441 |
|
* |
442 |
|
* Determine if the UTF-8 string of the given length is long enough |
443 |
|
* to be decoded by Tcl_UtfToUniChar(). This does not ensure that the |
444 |
|
* UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). |
445 |
|
* |
446 |
|
* Results: |
447 |
|
* The return value is 0 if the string is not long enough, non-zero |
448 |
|
* otherwise. |
449 |
|
* |
450 |
|
* Side effects: |
451 |
|
* None. |
452 |
|
* |
453 |
|
*--------------------------------------------------------------------------- |
454 |
|
*/ |
455 |
|
|
456 |
|
int |
457 |
|
Tcl_UtfCharComplete(str, len) |
458 |
|
CONST char *str; /* String to check if first few bytes |
459 |
|
* contain a complete UTF-8 character. */ |
460 |
|
int len; /* Length of above string in bytes. */ |
461 |
|
{ |
462 |
|
int ch; |
463 |
|
|
464 |
|
ch = *((unsigned char *) str); |
465 |
|
return len >= totalBytes[ch]; |
466 |
|
} |
467 |
|
|
468 |
|
/* |
469 |
|
*--------------------------------------------------------------------------- |
470 |
|
* |
471 |
|
* Tcl_NumUtfChars -- |
472 |
|
* |
473 |
|
* Returns the number of characters (not bytes) in the UTF-8 string, |
474 |
|
* not including the terminating NULL byte. This is equivalent to |
475 |
|
* Plan 9 utflen() and utfnlen(). |
476 |
|
* |
477 |
|
* Results: |
478 |
|
* As above. |
479 |
|
* |
480 |
|
* Side effects: |
481 |
|
* None. |
482 |
|
* |
483 |
|
*--------------------------------------------------------------------------- |
484 |
|
*/ |
485 |
|
|
486 |
|
int |
487 |
|
Tcl_NumUtfChars(str, len) |
488 |
|
register CONST char *str; /* The UTF-8 string to measure. */ |
489 |
|
int len; /* The length of the string in bytes, or -1 |
490 |
|
* for strlen(string). */ |
491 |
|
{ |
492 |
|
Tcl_UniChar ch; |
493 |
|
register Tcl_UniChar *chPtr = &ch; |
494 |
|
register int n; |
495 |
|
int i; |
496 |
|
|
497 |
|
/* |
498 |
|
* The separate implementations are faster. |
499 |
|
*/ |
500 |
|
|
501 |
|
i = 0; |
502 |
|
if (len < 0) { |
503 |
|
while (1) { |
504 |
|
str += Tcl_UtfToUniChar(str, chPtr); |
505 |
|
if (ch == '\0') { |
506 |
|
break; |
507 |
|
} |
508 |
|
i++; |
509 |
|
} |
510 |
|
} else { |
511 |
|
while (len > 0) { |
512 |
|
n = Tcl_UtfToUniChar(str, chPtr); |
513 |
|
len -= n; |
514 |
|
str += n; |
515 |
|
i++; |
516 |
|
} |
517 |
|
} |
518 |
|
return i; |
519 |
|
} |
520 |
|
|
521 |
|
/* |
522 |
|
*--------------------------------------------------------------------------- |
523 |
|
* |
524 |
|
* Tcl_UtfFindFirst -- |
525 |
|
* |
526 |
|
* Returns a pointer to the first occurance of the given Tcl_UniChar |
527 |
|
* in the NULL-terminated UTF-8 string. The NULL terminator is |
528 |
|
* considered part of the UTF-8 string. Equivalent to Plan 9 |
529 |
|
* utfrune(). |
530 |
|
* |
531 |
|
* Results: |
532 |
|
* As above. If the Tcl_UniChar does not exist in the given string, |
533 |
|
* the return value is NULL. |
534 |
|
* |
535 |
|
* Side effects: |
536 |
|
* None. |
537 |
|
* |
538 |
|
*--------------------------------------------------------------------------- |
539 |
|
*/ |
540 |
|
char * |
541 |
|
Tcl_UtfFindFirst(string, ch) |
542 |
|
CONST char *string; /* The UTF-8 string to be searched. */ |
543 |
|
int ch; /* The Tcl_UniChar to search for. */ |
544 |
|
{ |
545 |
|
int len; |
546 |
|
Tcl_UniChar find; |
547 |
|
|
548 |
|
while (1) { |
549 |
|
len = Tcl_UtfToUniChar(string, &find); |
550 |
|
if (find == ch) { |
551 |
|
return (char *) string; |
552 |
|
} |
553 |
|
if (*string == '\0') { |
554 |
|
return NULL; |
555 |
|
} |
556 |
|
string += len; |
557 |
|
} |
558 |
|
} |
559 |
|
|
560 |
|
/* |
561 |
|
*--------------------------------------------------------------------------- |
562 |
|
* |
563 |
|
* Tcl_UtfFindLast -- |
564 |
|
* |
565 |
|
* Returns a pointer to the last occurance of the given Tcl_UniChar |
566 |
|
* in the NULL-terminated UTF-8 string. The NULL terminator is |
567 |
|
* considered part of the UTF-8 string. Equivalent to Plan 9 |
568 |
|
* utfrrune(). |
569 |
|
* |
570 |
|
* Results: |
571 |
|
* As above. If the Tcl_UniChar does not exist in the given string, |
572 |
|
* the return value is NULL. |
573 |
|
* |
574 |
|
* Side effects: |
575 |
|
* None. |
576 |
|
* |
577 |
|
*--------------------------------------------------------------------------- |
578 |
|
*/ |
579 |
|
|
580 |
|
char * |
581 |
|
Tcl_UtfFindLast(string, ch) |
582 |
|
CONST char *string; /* The UTF-8 string to be searched. */ |
583 |
|
int ch; /* The Tcl_UniChar to search for. */ |
584 |
|
{ |
585 |
|
int len; |
586 |
|
Tcl_UniChar find; |
587 |
|
CONST char *last; |
588 |
|
|
589 |
|
last = NULL; |
590 |
|
while (1) { |
591 |
|
len = Tcl_UtfToUniChar(string, &find); |
592 |
|
if (find == ch) { |
593 |
|
last = string; |
594 |
|
} |
595 |
|
if (*string == '\0') { |
596 |
|
break; |
597 |
|
} |
598 |
|
string += len; |
599 |
|
} |
600 |
|
return (char *) last; |
601 |
|
} |
602 |
|
|
603 |
|
/* |
604 |
|
*--------------------------------------------------------------------------- |
605 |
|
* |
606 |
|
* Tcl_UtfNext -- |
607 |
|
* |
608 |
|
* Given a pointer to some current location in a UTF-8 string, |
609 |
|
* move forward one character. The caller must ensure that they |
610 |
|
* are not asking for the next character after the last character |
611 |
|
* in the string. |
612 |
|
* |
613 |
|
* Results: |
614 |
|
* The return value is the pointer to the next character in |
615 |
|
* the UTF-8 string. |
616 |
|
* |
617 |
|
* Side effects: |
618 |
|
* None. |
619 |
|
* |
620 |
|
*--------------------------------------------------------------------------- |
621 |
|
*/ |
622 |
|
|
623 |
|
char * |
624 |
|
Tcl_UtfNext(str) |
625 |
|
CONST char *str; /* The current location in the string. */ |
626 |
|
{ |
627 |
|
Tcl_UniChar ch; |
628 |
|
|
629 |
|
return (char *) str + Tcl_UtfToUniChar(str, &ch); |
630 |
|
} |
631 |
|
|
632 |
|
/* |
633 |
|
*--------------------------------------------------------------------------- |
634 |
|
* |
635 |
|
* Tcl_UtfPrev -- |
636 |
|
* |
637 |
|
* Given a pointer to some current location in a UTF-8 string, |
638 |
|
* move backwards one character. |
639 |
|
* |
640 |
|
* Results: |
641 |
|
* The return value is a pointer to the previous character in the |
642 |
|
* UTF-8 string. If the current location was already at the |
643 |
|
* beginning of the string, the return value will also be a |
644 |
|
* pointer to the beginning of the string. |
645 |
|
* |
646 |
|
* Side effects: |
647 |
|
* None. |
648 |
|
* |
649 |
|
*--------------------------------------------------------------------------- |
650 |
|
*/ |
651 |
|
|
652 |
|
char * |
653 |
|
Tcl_UtfPrev(str, start) |
654 |
|
CONST char *str; /* The current location in the string. */ |
655 |
|
CONST char *start; /* Pointer to the beginning of the |
656 |
|
* string, to avoid going backwards too |
657 |
|
* far. */ |
658 |
|
{ |
659 |
|
CONST char *look; |
660 |
|
int i, byte; |
661 |
|
|
662 |
|
str--; |
663 |
|
look = str; |
664 |
|
for (i = 0; i < TCL_UTF_MAX; i++) { |
665 |
|
if (look < start) { |
666 |
|
if (str < start) { |
667 |
|
str = start; |
668 |
|
} |
669 |
|
break; |
670 |
|
} |
671 |
|
byte = *((unsigned char *) look); |
672 |
|
if (byte < 0x80) { |
673 |
|
break; |
674 |
|
} |
675 |
|
if (byte >= 0xC0) { |
676 |
|
if (totalBytes[byte] != i + 1) { |
677 |
|
break; |
678 |
|
} |
679 |
|
return (char *) look; |
680 |
|
} |
681 |
|
look--; |
682 |
|
} |
683 |
|
return (char *) str; |
684 |
|
} |
685 |
|
|
686 |
|
/* |
687 |
|
*--------------------------------------------------------------------------- |
688 |
|
* |
689 |
|
* Tcl_UniCharAtIndex -- |
690 |
|
* |
691 |
|
* Returns the Unicode character represented at the specified |
692 |
|
* character (not byte) position in the UTF-8 string. |
693 |
|
* |
694 |
|
* Results: |
695 |
|
* As above. |
696 |
|
* |
697 |
|
* Side effects: |
698 |
|
* None. |
699 |
|
* |
700 |
|
*--------------------------------------------------------------------------- |
701 |
|
*/ |
702 |
|
|
703 |
|
Tcl_UniChar |
704 |
|
Tcl_UniCharAtIndex(src, index) |
705 |
|
register CONST char *src; /* The UTF-8 string to dereference. */ |
706 |
|
register int index; /* The position of the desired character. */ |
707 |
|
{ |
708 |
|
Tcl_UniChar ch; |
709 |
|
|
710 |
|
while (index >= 0) { |
711 |
|
index--; |
712 |
|
src += Tcl_UtfToUniChar(src, &ch); |
713 |
|
} |
714 |
|
return ch; |
715 |
|
} |
716 |
|
|
717 |
|
/* |
718 |
|
*--------------------------------------------------------------------------- |
719 |
|
* |
720 |
|
* Tcl_UtfAtIndex -- |
721 |
|
* |
722 |
|
* Returns a pointer to the specified character (not byte) position |
723 |
|
* in the UTF-8 string. |
724 |
|
* |
725 |
|
* Results: |
726 |
|
* As above. |
727 |
|
* |
728 |
|
* Side effects: |
729 |
|
* None. |
730 |
|
* |
731 |
|
*--------------------------------------------------------------------------- |
732 |
|
*/ |
733 |
|
|
734 |
|
char * |
735 |
|
Tcl_UtfAtIndex(src, index) |
736 |
|
register CONST char *src; /* The UTF-8 string. */ |
737 |
|
register int index; /* The position of the desired character. */ |
738 |
|
{ |
739 |
|
Tcl_UniChar ch; |
740 |
|
|
741 |
|
while (index > 0) { |
742 |
|
index--; |
743 |
|
src += Tcl_UtfToUniChar(src, &ch); |
744 |
|
} |
745 |
|
return (char *) src; |
746 |
|
} |
747 |
|
|
748 |
|
/* |
749 |
|
*--------------------------------------------------------------------------- |
750 |
|
* |
751 |
|
* Tcl_UtfBackslash -- |
752 |
|
* |
753 |
|
* Figure out how to handle a backslash sequence. |
754 |
|
* |
755 |
|
* Results: |
756 |
|
* Stores the bytes represented by the backslash sequence in dst and |
757 |
|
* returns the number of bytes written to dst. At most TCL_UTF_MAX |
758 |
|
* bytes are written to dst; dst must have been large enough to accept |
759 |
|
* those bytes. If readPtr isn't NULL then it is filled in with a |
760 |
|
* count of the number of bytes in the backslash sequence. |
761 |
|
* |
762 |
|
* Side effects: |
763 |
|
* The maximum number of bytes it takes to represent a Unicode |
764 |
|
* character in UTF-8 is guaranteed to be less than the number of |
765 |
|
* bytes used to express the backslash sequence that represents |
766 |
|
* that Unicode character. If the target buffer into which the |
767 |
|
* caller is going to store the bytes that represent the Unicode |
768 |
|
* character is at least as large as the source buffer from which |
769 |
|
* the backslashed sequence was extracted, no buffer overruns should |
770 |
|
* occur. |
771 |
|
* |
772 |
|
*--------------------------------------------------------------------------- |
773 |
|
*/ |
774 |
|
|
775 |
|
int |
776 |
|
Tcl_UtfBackslash(src, readPtr, dst) |
777 |
|
CONST char *src; /* Points to the backslash character of |
778 |
|
* a backslash sequence. */ |
779 |
|
int *readPtr; /* Fill in with number of characters read |
780 |
|
* from src, unless NULL. */ |
781 |
|
char *dst; /* Filled with the bytes represented by the |
782 |
|
* backslash sequence. */ |
783 |
|
{ |
784 |
|
register CONST char *p = src+1; |
785 |
|
int result, count, n; |
786 |
|
char buf[TCL_UTF_MAX]; |
787 |
|
|
788 |
|
if (dst == NULL) { |
789 |
|
dst = buf; |
790 |
|
} |
791 |
|
|
792 |
|
count = 2; |
793 |
|
switch (*p) { |
794 |
|
/* |
795 |
|
* Note: in the conversions below, use absolute values (e.g., |
796 |
|
* 0xa) rather than symbolic values (e.g. \n) that get converted |
797 |
|
* by the compiler. It's possible that compilers on some |
798 |
|
* platforms will do the symbolic conversions differently, which |
799 |
|
* could result in non-portable Tcl scripts. |
800 |
|
*/ |
801 |
|
|
802 |
|
case 'a': |
803 |
|
result = 0x7; |
804 |
|
break; |
805 |
|
case 'b': |
806 |
|
result = 0x8; |
807 |
|
break; |
808 |
|
case 'f': |
809 |
|
result = 0xc; |
810 |
|
break; |
811 |
|
case 'n': |
812 |
|
result = 0xa; |
813 |
|
break; |
814 |
|
case 'r': |
815 |
|
result = 0xd; |
816 |
|
break; |
817 |
|
case 't': |
818 |
|
result = 0x9; |
819 |
|
break; |
820 |
|
case 'v': |
821 |
|
result = 0xb; |
822 |
|
break; |
823 |
|
case 'x': |
824 |
|
if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ |
825 |
|
char *end; |
826 |
|
|
827 |
|
result = (unsigned char) strtoul(p+1, &end, 16); |
828 |
|
count = end - src; |
829 |
|
} else { |
830 |
|
count = 2; |
831 |
|
result = 'x'; |
832 |
|
} |
833 |
|
break; |
834 |
|
case 'u': |
835 |
|
result = 0; |
836 |
|
for (count = 0; count < 4; count++) { |
837 |
|
p++; |
838 |
|
if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ |
839 |
|
break; |
840 |
|
} |
841 |
|
n = *p - '0'; |
842 |
|
if (n > 9) { |
843 |
|
n = n + '0' + 10 - 'A'; |
844 |
|
} |
845 |
|
if (n > 16) { |
846 |
|
n = n + 'A' - 'a'; |
847 |
|
} |
848 |
|
result = (result << 4) + n; |
849 |
|
} |
850 |
|
if (count == 0) { |
851 |
|
result = 'u'; |
852 |
|
} |
853 |
|
count += 2; |
854 |
|
break; |
855 |
|
|
856 |
|
case '\n': |
857 |
|
do { |
858 |
|
p++; |
859 |
|
} while ((*p == ' ') || (*p == '\t')); |
860 |
|
result = ' '; |
861 |
|
count = p - src; |
862 |
|
break; |
863 |
|
case 0: |
864 |
|
result = '\\'; |
865 |
|
count = 1; |
866 |
|
break; |
867 |
|
default: |
868 |
|
/* |
869 |
|
* Check for an octal number \oo?o? |
870 |
|
*/ |
871 |
|
if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */ |
872 |
|
result = (unsigned char)(*p - '0'); |
873 |
|
p++; |
874 |
|
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
875 |
|
break; |
876 |
|
} |
877 |
|
count = 3; |
878 |
|
result = (unsigned char)((result << 3) + (*p - '0')); |
879 |
|
p++; |
880 |
|
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
881 |
|
break; |
882 |
|
} |
883 |
|
count = 4; |
884 |
|
result = (unsigned char)((result << 3) + (*p - '0')); |
885 |
|
break; |
886 |
|
} |
887 |
|
result = *p; |
888 |
|
count = 2; |
889 |
|
break; |
890 |
|
} |
891 |
|
|
892 |
|
if (readPtr != NULL) { |
893 |
|
*readPtr = count; |
894 |
|
} |
895 |
|
return Tcl_UniCharToUtf(result, dst); |
896 |
|
} |
897 |
|
|
898 |
|
/* |
899 |
|
*---------------------------------------------------------------------- |
900 |
|
* |
901 |
|
* Tcl_UtfToUpper -- |
902 |
|
* |
903 |
|
* Convert lowercase characters to uppercase characters in a UTF |
904 |
|
* string in place. The conversion may shrink the UTF string. |
905 |
|
* |
906 |
|
* Results: |
907 |
|
* Returns the number of bytes in the resulting string |
908 |
|
* excluding the trailing null. |
909 |
|
* |
910 |
|
* Side effects: |
911 |
|
* Writes a terminating null after the last converted character. |
912 |
|
* |
913 |
|
*---------------------------------------------------------------------- |
914 |
|
*/ |
915 |
|
|
916 |
|
int |
917 |
|
Tcl_UtfToUpper(str) |
918 |
|
char *str; /* String to convert in place. */ |
919 |
|
{ |
920 |
|
Tcl_UniChar ch, upChar; |
921 |
|
char *src, *dst; |
922 |
|
int bytes; |
923 |
|
|
924 |
|
/* |
925 |
|
* Iterate over the string until we hit the terminating null. |
926 |
|
*/ |
927 |
|
|
928 |
|
src = dst = str; |
929 |
|
while (*src) { |
930 |
|
bytes = Tcl_UtfToUniChar(src, &ch); |
931 |
|
upChar = Tcl_UniCharToUpper(ch); |
932 |
|
|
933 |
|
/* |
934 |
|
* To keep badly formed Utf strings from getting inflated by |
935 |
|
* the conversion (thereby causing a segfault), only copy the |
936 |
|
* upper case char to dst if its size is <= the original char. |
937 |
|
*/ |
938 |
|
|
939 |
|
if (bytes < UtfCount(upChar)) { |
940 |
|
memcpy(dst, src, (size_t) bytes); |
941 |
|
dst += bytes; |
942 |
|
} else { |
943 |
|
dst += Tcl_UniCharToUtf(upChar, dst); |
944 |
|
} |
945 |
|
src += bytes; |
946 |
|
} |
947 |
|
*dst = '\0'; |
948 |
|
return (dst - str); |
949 |
|
} |
950 |
|
|
951 |
|
/* |
952 |
|
*---------------------------------------------------------------------- |
953 |
|
* |
954 |
|
* Tcl_UtfToLower -- |
955 |
|
* |
956 |
|
* Convert uppercase characters to lowercase characters in a UTF |
957 |
|
* string in place. The conversion may shrink the UTF string. |
958 |
|
* |
959 |
|
* Results: |
960 |
|
* Returns the number of bytes in the resulting string |
961 |
|
* excluding the trailing null. |
962 |
|
* |
963 |
|
* Side effects: |
964 |
|
* Writes a terminating null after the last converted character. |
965 |
|
* |
966 |
|
*---------------------------------------------------------------------- |
967 |
|
*/ |
968 |
|
|
969 |
|
int |
970 |
|
Tcl_UtfToLower(str) |
971 |
|
char *str; /* String to convert in place. */ |
972 |
|
{ |
973 |
|
Tcl_UniChar ch, lowChar; |
974 |
|
char *src, *dst; |
975 |
|
int bytes; |
976 |
|
|
977 |
|
/* |
978 |
|
* Iterate over the string until we hit the terminating null. |
979 |
|
*/ |
980 |
|
|
981 |
|
src = dst = str; |
982 |
|
while (*src) { |
983 |
|
bytes = Tcl_UtfToUniChar(src, &ch); |
984 |
|
lowChar = Tcl_UniCharToLower(ch); |
985 |
|
|
986 |
|
/* |
987 |
|
* To keep badly formed Utf strings from getting inflated by |
988 |
|
* the conversion (thereby causing a segfault), only copy the |
989 |
|
* lower case char to dst if its size is <= the original char. |
990 |
|
*/ |
991 |
|
|
992 |
|
if (bytes < UtfCount(lowChar)) { |
993 |
|
memcpy(dst, src, (size_t) bytes); |
994 |
|
dst += bytes; |
995 |
|
} else { |
996 |
|
dst += Tcl_UniCharToUtf(lowChar, dst); |
997 |
|
} |
998 |
|
src += bytes; |
999 |
|
} |
1000 |
|
*dst = '\0'; |
1001 |
|
return (dst - str); |
1002 |
|
} |
1003 |
|
|
1004 |
|
/* |
1005 |
|
*---------------------------------------------------------------------- |
1006 |
|
* |
1007 |
|
* Tcl_UtfToTitle -- |
1008 |
|
* |
1009 |
|
* Changes the first character of a UTF string to title case or |
1010 |
|
* uppercase and the rest of the string to lowercase. The |
1011 |
|
* conversion happens in place and may shrink the UTF string. |
1012 |
|
* |
1013 |
|
* Results: |
1014 |
|
* Returns the number of bytes in the resulting string |
1015 |
|
* excluding the trailing null. |
1016 |
|
* |
1017 |
|
* Side effects: |
1018 |
|
* Writes a terminating null after the last converted character. |
1019 |
|
* |
1020 |
|
*---------------------------------------------------------------------- |
1021 |
|
*/ |
1022 |
|
|
1023 |
|
int |
1024 |
|
Tcl_UtfToTitle(str) |
1025 |
|
char *str; /* String to convert in place. */ |
1026 |
|
{ |
1027 |
|
Tcl_UniChar ch, titleChar, lowChar; |
1028 |
|
char *src, *dst; |
1029 |
|
int bytes; |
1030 |
|
|
1031 |
|
/* |
1032 |
|
* Capitalize the first character and then lowercase the rest of the |
1033 |
|
* characters until we get to a null. |
1034 |
|
*/ |
1035 |
|
|
1036 |
|
src = dst = str; |
1037 |
|
|
1038 |
|
if (*src) { |
1039 |
|
bytes = Tcl_UtfToUniChar(src, &ch); |
1040 |
|
titleChar = Tcl_UniCharToTitle(ch); |
1041 |
|
|
1042 |
|
if (bytes < UtfCount(titleChar)) { |
1043 |
|
memcpy(dst, src, (size_t) bytes); |
1044 |
|
dst += bytes; |
1045 |
|
} else { |
1046 |
|
dst += Tcl_UniCharToUtf(titleChar, dst); |
1047 |
|
} |
1048 |
|
src += bytes; |
1049 |
|
} |
1050 |
|
while (*src) { |
1051 |
|
bytes = Tcl_UtfToUniChar(src, &ch); |
1052 |
|
lowChar = Tcl_UniCharToLower(ch); |
1053 |
|
|
1054 |
|
if (bytes < UtfCount(lowChar)) { |
1055 |
|
memcpy(dst, src, (size_t) bytes); |
1056 |
|
dst += bytes; |
1057 |
|
} else { |
1058 |
|
dst += Tcl_UniCharToUtf(lowChar, dst); |
1059 |
|
} |
1060 |
|
src += bytes; |
1061 |
|
} |
1062 |
|
*dst = '\0'; |
1063 |
|
return (dst - str); |
1064 |
|
} |
1065 |
|
|
1066 |
|
/* |
1067 |
|
*---------------------------------------------------------------------- |
1068 |
|
* |
1069 |
|
* Tcl_UtfNcmp -- |
1070 |
|
* |
1071 |
|
* Compare at most n UTF chars of string cs to string ct. Both cs |
1072 |
|
* and ct are assumed to be at least n UTF chars long. |
1073 |
|
* |
1074 |
|
* Results: |
1075 |
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1076 |
|
* |
1077 |
|
* Side effects: |
1078 |
|
* None. |
1079 |
|
* |
1080 |
|
*---------------------------------------------------------------------- |
1081 |
|
*/ |
1082 |
|
|
1083 |
|
int |
1084 |
|
Tcl_UtfNcmp(cs, ct, n) |
1085 |
|
CONST char *cs; /* UTF string to compare to ct. */ |
1086 |
|
CONST char *ct; /* UTF string cs is compared to. */ |
1087 |
|
unsigned long n; /* Number of UTF chars to compare. */ |
1088 |
|
{ |
1089 |
|
Tcl_UniChar ch1, ch2; |
1090 |
|
/* |
1091 |
|
* Another approach that should work is: |
1092 |
|
* return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs)); |
1093 |
|
* That assumes that ct is a properly formed UTF, so we will just |
1094 |
|
* be comparing the bytes that compromise those strings to the |
1095 |
|
* char length n. |
1096 |
|
*/ |
1097 |
|
while (n-- > 0) { |
1098 |
|
/* |
1099 |
|
* n must be interpreted as chars, not bytes. |
1100 |
|
* This should be called only when both strings are of |
1101 |
|
* at least n chars long (no need for \0 check) |
1102 |
|
*/ |
1103 |
|
cs += Tcl_UtfToUniChar(cs, &ch1); |
1104 |
|
ct += Tcl_UtfToUniChar(ct, &ch2); |
1105 |
|
if (ch1 != ch2) { |
1106 |
|
return (ch1 - ch2); |
1107 |
|
} |
1108 |
|
} |
1109 |
|
return 0; |
1110 |
|
} |
1111 |
|
|
1112 |
|
/* |
1113 |
|
*---------------------------------------------------------------------- |
1114 |
|
* |
1115 |
|
* Tcl_UtfNcasecmp -- |
1116 |
|
* |
1117 |
|
* Compare at most n UTF chars of string cs to string ct case |
1118 |
|
* insensitive. Both cs and ct are assumed to be at least n |
1119 |
|
* UTF chars long. |
1120 |
|
* |
1121 |
|
* Results: |
1122 |
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1123 |
|
* |
1124 |
|
* Side effects: |
1125 |
|
* None. |
1126 |
|
* |
1127 |
|
*---------------------------------------------------------------------- |
1128 |
|
*/ |
1129 |
|
|
1130 |
|
int |
1131 |
|
Tcl_UtfNcasecmp(cs, ct, n) |
1132 |
|
CONST char *cs; /* UTF string to compare to ct. */ |
1133 |
|
CONST char *ct; /* UTF string cs is compared to. */ |
1134 |
|
unsigned long n; /* Number of UTF chars to compare. */ |
1135 |
|
{ |
1136 |
|
Tcl_UniChar ch1, ch2; |
1137 |
|
while (n-- > 0) { |
1138 |
|
/* |
1139 |
|
* n must be interpreted as chars, not bytes. |
1140 |
|
* This should be called only when both strings are of |
1141 |
|
* at least n chars long (no need for \0 check) |
1142 |
|
*/ |
1143 |
|
cs += Tcl_UtfToUniChar(cs, &ch1); |
1144 |
|
ct += Tcl_UtfToUniChar(ct, &ch2); |
1145 |
|
if (ch1 != ch2) { |
1146 |
|
ch1 = Tcl_UniCharToLower(ch1); |
1147 |
|
ch2 = Tcl_UniCharToLower(ch2); |
1148 |
|
if (ch1 != ch2) { |
1149 |
|
return (ch1 - ch2); |
1150 |
|
} |
1151 |
|
} |
1152 |
|
} |
1153 |
|
return 0; |
1154 |
|
} |
1155 |
|
|
1156 |
|
/* |
1157 |
|
*---------------------------------------------------------------------- |
1158 |
|
* |
1159 |
|
* Tcl_UniCharToUpper -- |
1160 |
|
* |
1161 |
|
* Compute the uppercase equivalent of the given Unicode character. |
1162 |
|
* |
1163 |
|
* Results: |
1164 |
|
* Returns the uppercase Unicode character. |
1165 |
|
* |
1166 |
|
* Side effects: |
1167 |
|
* None. |
1168 |
|
* |
1169 |
|
*---------------------------------------------------------------------- |
1170 |
|
*/ |
1171 |
|
|
1172 |
|
Tcl_UniChar |
1173 |
|
Tcl_UniCharToUpper(ch) |
1174 |
|
int ch; /* Unicode character to convert. */ |
1175 |
|
{ |
1176 |
|
int info = GetUniCharInfo(ch); |
1177 |
|
|
1178 |
|
if (GetCaseType(info) & 0x04) { |
1179 |
|
return (Tcl_UniChar) (ch - GetDelta(info)); |
1180 |
|
} else { |
1181 |
|
return ch; |
1182 |
|
} |
1183 |
|
} |
1184 |
|
|
1185 |
|
/* |
1186 |
|
*---------------------------------------------------------------------- |
1187 |
|
* |
1188 |
|
* Tcl_UniCharToLower -- |
1189 |
|
* |
1190 |
|
* Compute the lowercase equivalent of the given Unicode character. |
1191 |
|
* |
1192 |
|
* Results: |
1193 |
|
* Returns the lowercase Unicode character. |
1194 |
|
* |
1195 |
|
* Side effects: |
1196 |
|
* None. |
1197 |
|
* |
1198 |
|
*---------------------------------------------------------------------- |
1199 |
|
*/ |
1200 |
|
|
1201 |
|
Tcl_UniChar |
1202 |
|
Tcl_UniCharToLower(ch) |
1203 |
|
int ch; /* Unicode character to convert. */ |
1204 |
|
{ |
1205 |
|
int info = GetUniCharInfo(ch); |
1206 |
|
|
1207 |
|
if (GetCaseType(info) & 0x02) { |
1208 |
|
return (Tcl_UniChar) (ch + GetDelta(info)); |
1209 |
|
} else { |
1210 |
|
return ch; |
1211 |
|
} |
1212 |
|
} |
1213 |
|
|
1214 |
|
/* |
1215 |
|
*---------------------------------------------------------------------- |
1216 |
|
* |
1217 |
|
* Tcl_UniCharToTitle -- |
1218 |
|
* |
1219 |
|
* Compute the titlecase equivalent of the given Unicode character. |
1220 |
|
* |
1221 |
|
* Results: |
1222 |
|
* Returns the titlecase Unicode character. |
1223 |
|
* |
1224 |
|
* Side effects: |
1225 |
|
* None. |
1226 |
|
* |
1227 |
|
*---------------------------------------------------------------------- |
1228 |
|
*/ |
1229 |
|
|
1230 |
|
Tcl_UniChar |
1231 |
|
Tcl_UniCharToTitle(ch) |
1232 |
|
int ch; /* Unicode character to convert. */ |
1233 |
|
{ |
1234 |
|
int info = GetUniCharInfo(ch); |
1235 |
|
int mode = GetCaseType(info); |
1236 |
|
|
1237 |
|
if (mode & 0x1) { |
1238 |
|
/* |
1239 |
|
* Subtract or add one depending on the original case. |
1240 |
|
*/ |
1241 |
|
|
1242 |
|
return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); |
1243 |
|
} else if (mode == 0x4) { |
1244 |
|
return (Tcl_UniChar) (ch - GetDelta(info)); |
1245 |
|
} else { |
1246 |
|
return ch; |
1247 |
|
} |
1248 |
|
} |
1249 |
|
|
1250 |
|
/* |
1251 |
|
*---------------------------------------------------------------------- |
1252 |
|
* |
1253 |
|
* Tcl_UniCharLen -- |
1254 |
|
* |
1255 |
|
* Find the length of a UniChar string. The str input must be null |
1256 |
|
* terminated. |
1257 |
|
* |
1258 |
|
* Results: |
1259 |
|
* Returns the length of str in UniChars (not bytes). |
1260 |
|
* |
1261 |
|
* Side effects: |
1262 |
|
* None. |
1263 |
|
* |
1264 |
|
*---------------------------------------------------------------------- |
1265 |
|
*/ |
1266 |
|
|
1267 |
|
int |
1268 |
|
Tcl_UniCharLen(str) |
1269 |
|
Tcl_UniChar *str; /* Unicode string to find length of. */ |
1270 |
|
{ |
1271 |
|
int len = 0; |
1272 |
|
|
1273 |
|
while (*str != '\0') { |
1274 |
|
len++; |
1275 |
|
str++; |
1276 |
|
} |
1277 |
|
return len; |
1278 |
|
} |
1279 |
|
|
1280 |
|
/* |
1281 |
|
*---------------------------------------------------------------------- |
1282 |
|
* |
1283 |
|
* Tcl_UniCharNcmp -- |
1284 |
|
* |
1285 |
|
* Compare at most n unichars of string cs to string ct. Both cs |
1286 |
|
* and ct are assumed to be at least n unichars long. |
1287 |
|
* |
1288 |
|
* Results: |
1289 |
|
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1290 |
|
* |
1291 |
|
* Side effects: |
1292 |
|
* None. |
1293 |
|
* |
1294 |
|
*---------------------------------------------------------------------- |
1295 |
|
*/ |
1296 |
|
|
1297 |
|
int |
1298 |
|
Tcl_UniCharNcmp(cs, ct, n) |
1299 |
|
CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ |
1300 |
|
CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ |
1301 |
|
unsigned long n; /* Number of unichars to compare. */ |
1302 |
|
{ |
1303 |
|
for ( ; n != 0; n--, cs++, ct++) { |
1304 |
|
if (*cs != *ct) { |
1305 |
|
return *cs - *ct; |
1306 |
|
} |
1307 |
|
if (*cs == '\0') { |
1308 |
|
break; |
1309 |
|
} |
1310 |
|
} |
1311 |
|
return 0; |
1312 |
|
} |
1313 |
|
|
1314 |
|
/* |
1315 |
|
*---------------------------------------------------------------------- |
1316 |
|
* |
1317 |
|
* Tcl_UniCharIsAlnum -- |
1318 |
|
* |
1319 |
|
* Test if a character is an alphanumeric Unicode character. |
1320 |
|
* |
1321 |
|
* Results: |
1322 |
|
* Returns 1 if character is alphanumeric. |
1323 |
|
* |
1324 |
|
* Side effects: |
1325 |
|
* None. |
1326 |
|
* |
1327 |
|
*---------------------------------------------------------------------- |
1328 |
|
*/ |
1329 |
|
|
1330 |
|
int |
1331 |
|
Tcl_UniCharIsAlnum(ch) |
1332 |
|
int ch; /* Unicode character to test. */ |
1333 |
|
{ |
1334 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1335 |
|
|
1336 |
|
return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); |
1337 |
|
} |
1338 |
|
|
1339 |
|
/* |
1340 |
|
*---------------------------------------------------------------------- |
1341 |
|
* |
1342 |
|
* Tcl_UniCharIsAlpha -- |
1343 |
|
* |
1344 |
|
* Test if a character is an alphabetic Unicode character. |
1345 |
|
* |
1346 |
|
* Results: |
1347 |
|
* Returns 1 if character is alphabetic. |
1348 |
|
* |
1349 |
|
* Side effects: |
1350 |
|
* None. |
1351 |
|
* |
1352 |
|
*---------------------------------------------------------------------- |
1353 |
|
*/ |
1354 |
|
|
1355 |
|
int |
1356 |
|
Tcl_UniCharIsAlpha(ch) |
1357 |
|
int ch; /* Unicode character to test. */ |
1358 |
|
{ |
1359 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1360 |
|
return ((ALPHA_BITS >> category) & 1); |
1361 |
|
} |
1362 |
|
|
1363 |
|
/* |
1364 |
|
*---------------------------------------------------------------------- |
1365 |
|
* |
1366 |
|
* Tcl_UniCharIsControl -- |
1367 |
|
* |
1368 |
|
* Test if a character is a Unicode control character. |
1369 |
|
* |
1370 |
|
* Results: |
1371 |
|
* Returns non-zero if character is a control. |
1372 |
|
* |
1373 |
|
* Side effects: |
1374 |
|
* None. |
1375 |
|
* |
1376 |
|
*---------------------------------------------------------------------- |
1377 |
|
*/ |
1378 |
|
|
1379 |
|
int |
1380 |
|
Tcl_UniCharIsControl(ch) |
1381 |
|
int ch; /* Unicode character to test. */ |
1382 |
|
{ |
1383 |
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); |
1384 |
|
} |
1385 |
|
|
1386 |
|
/* |
1387 |
|
*---------------------------------------------------------------------- |
1388 |
|
* |
1389 |
|
* Tcl_UniCharIsDigit -- |
1390 |
|
* |
1391 |
|
* Test if a character is a numeric Unicode character. |
1392 |
|
* |
1393 |
|
* Results: |
1394 |
|
* Returns non-zero if character is a digit. |
1395 |
|
* |
1396 |
|
* Side effects: |
1397 |
|
* None. |
1398 |
|
* |
1399 |
|
*---------------------------------------------------------------------- |
1400 |
|
*/ |
1401 |
|
|
1402 |
|
int |
1403 |
|
Tcl_UniCharIsDigit(ch) |
1404 |
|
int ch; /* Unicode character to test. */ |
1405 |
|
{ |
1406 |
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) |
1407 |
|
== DECIMAL_DIGIT_NUMBER); |
1408 |
|
} |
1409 |
|
|
1410 |
|
/* |
1411 |
|
*---------------------------------------------------------------------- |
1412 |
|
* |
1413 |
|
* Tcl_UniCharIsGraph -- |
1414 |
|
* |
1415 |
|
* Test if a character is any Unicode print character except space. |
1416 |
|
* |
1417 |
|
* Results: |
1418 |
|
* Returns non-zero if character is printable, but not space. |
1419 |
|
* |
1420 |
|
* Side effects: |
1421 |
|
* None. |
1422 |
|
* |
1423 |
|
*---------------------------------------------------------------------- |
1424 |
|
*/ |
1425 |
|
|
1426 |
|
int |
1427 |
|
Tcl_UniCharIsGraph(ch) |
1428 |
|
int ch; /* Unicode character to test. */ |
1429 |
|
{ |
1430 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1431 |
|
return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); |
1432 |
|
} |
1433 |
|
|
1434 |
|
/* |
1435 |
|
*---------------------------------------------------------------------- |
1436 |
|
* |
1437 |
|
* Tcl_UniCharIsLower -- |
1438 |
|
* |
1439 |
|
* Test if a character is a lowercase Unicode character. |
1440 |
|
* |
1441 |
|
* Results: |
1442 |
|
* Returns non-zero if character is lowercase. |
1443 |
|
* |
1444 |
|
* Side effects: |
1445 |
|
* None. |
1446 |
|
* |
1447 |
|
*---------------------------------------------------------------------- |
1448 |
|
*/ |
1449 |
|
|
1450 |
|
int |
1451 |
|
Tcl_UniCharIsLower(ch) |
1452 |
|
int ch; /* Unicode character to test. */ |
1453 |
|
{ |
1454 |
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); |
1455 |
|
} |
1456 |
|
|
1457 |
|
/* |
1458 |
|
*---------------------------------------------------------------------- |
1459 |
|
* |
1460 |
|
* Tcl_UniCharIsPrint -- |
1461 |
|
* |
1462 |
|
* Test if a character is a Unicode print character. |
1463 |
|
* |
1464 |
|
* Results: |
1465 |
|
* Returns non-zero if character is printable. |
1466 |
|
* |
1467 |
|
* Side effects: |
1468 |
|
* None. |
1469 |
|
* |
1470 |
|
*---------------------------------------------------------------------- |
1471 |
|
*/ |
1472 |
|
|
1473 |
|
int |
1474 |
|
Tcl_UniCharIsPrint(ch) |
1475 |
|
int ch; /* Unicode character to test. */ |
1476 |
|
{ |
1477 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1478 |
|
return ((PRINT_BITS >> category) & 1); |
1479 |
|
} |
1480 |
|
|
1481 |
|
/* |
1482 |
|
*---------------------------------------------------------------------- |
1483 |
|
* |
1484 |
|
* Tcl_UniCharIsPunct -- |
1485 |
|
* |
1486 |
|
* Test if a character is a Unicode punctuation character. |
1487 |
|
* |
1488 |
|
* Results: |
1489 |
|
* Returns non-zero if character is punct. |
1490 |
|
* |
1491 |
|
* Side effects: |
1492 |
|
* None. |
1493 |
|
* |
1494 |
|
*---------------------------------------------------------------------- |
1495 |
|
*/ |
1496 |
|
|
1497 |
|
int |
1498 |
|
Tcl_UniCharIsPunct(ch) |
1499 |
|
int ch; /* Unicode character to test. */ |
1500 |
|
{ |
1501 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1502 |
|
return ((PUNCT_BITS >> category) & 1); |
1503 |
|
} |
1504 |
|
|
1505 |
|
/* |
1506 |
|
*---------------------------------------------------------------------- |
1507 |
|
* |
1508 |
|
* Tcl_UniCharIsSpace -- |
1509 |
|
* |
1510 |
|
* Test if a character is a whitespace Unicode character. |
1511 |
|
* |
1512 |
|
* Results: |
1513 |
|
* Returns non-zero if character is a space. |
1514 |
|
* |
1515 |
|
* Side effects: |
1516 |
|
* None. |
1517 |
|
* |
1518 |
|
*---------------------------------------------------------------------- |
1519 |
|
*/ |
1520 |
|
|
1521 |
|
int |
1522 |
|
Tcl_UniCharIsSpace(ch) |
1523 |
|
int ch; /* Unicode character to test. */ |
1524 |
|
{ |
1525 |
|
register int category; |
1526 |
|
|
1527 |
|
/* |
1528 |
|
* If the character is within the first 127 characters, just use the |
1529 |
|
* standard C function, otherwise consult the Unicode table. |
1530 |
|
*/ |
1531 |
|
|
1532 |
|
if (ch < 0x80) { |
1533 |
|
return isspace(UCHAR(ch)); /* INTL: ISO space */ |
1534 |
|
} else { |
1535 |
|
category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1536 |
|
return ((SPACE_BITS >> category) & 1); |
1537 |
|
} |
1538 |
|
} |
1539 |
|
|
1540 |
|
/* |
1541 |
|
*---------------------------------------------------------------------- |
1542 |
|
* |
1543 |
|
* Tcl_UniCharIsUpper -- |
1544 |
|
* |
1545 |
|
* Test if a character is a uppercase Unicode character. |
1546 |
|
* |
1547 |
|
* Results: |
1548 |
|
* Returns non-zero if character is uppercase. |
1549 |
|
* |
1550 |
|
* Side effects: |
1551 |
|
* None. |
1552 |
|
* |
1553 |
|
*---------------------------------------------------------------------- |
1554 |
|
*/ |
1555 |
|
|
1556 |
|
int |
1557 |
|
Tcl_UniCharIsUpper(ch) |
1558 |
|
int ch; /* Unicode character to test. */ |
1559 |
|
{ |
1560 |
|
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); |
1561 |
|
} |
1562 |
|
|
1563 |
|
/* |
1564 |
|
*---------------------------------------------------------------------- |
1565 |
|
* |
1566 |
|
* Tcl_UniCharIsWordChar -- |
1567 |
|
* |
1568 |
|
* Test if a character is alphanumeric or a connector punctuation |
1569 |
|
* mark. |
1570 |
|
* |
1571 |
|
* Results: |
1572 |
|
* Returns 1 if character is a word character. |
1573 |
|
* |
1574 |
|
* Side effects: |
1575 |
|
* None. |
1576 |
|
* |
1577 |
|
*---------------------------------------------------------------------- |
1578 |
|
*/ |
1579 |
|
|
1580 |
|
int |
1581 |
|
Tcl_UniCharIsWordChar(ch) |
1582 |
|
int ch; /* Unicode character to test. */ |
1583 |
|
{ |
1584 |
|
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1585 |
|
|
1586 |
|
return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); |
1587 |
|
} |
1588 |
|
|
1589 |
|
/* End of tclutf.c */ |