1 |
/* $Header$ */ |
2 |
/* |
3 |
* tclUtf.c -- |
4 |
* |
5 |
* Routines for manipulating UTF-8 strings. |
6 |
* |
7 |
* Copyright (c) 1997-1998 Sun Microsystems, Inc. |
8 |
* |
9 |
* See the file "license.terms" for information on usage and redistribution |
10 |
* of this file, and for a DISCLAIMER OF ALL WARRANTIES. |
11 |
* |
12 |
* RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ |
13 |
*/ |
14 |
|
15 |
#include "tclInt.h" |
16 |
|
17 |
/* |
18 |
* Include the static character classification tables and macros. |
19 |
*/ |
20 |
|
21 |
#include "tclUniData.c" |
22 |
|
23 |
/* |
24 |
* The following macros are used for fast character category tests. The |
25 |
* x_BITS values are shifted right by the category value to determine whether |
26 |
* the given category is included in the set. |
27 |
*/ |
28 |
|
29 |
/* Letters of every kind: upper, lower, title case, modifier and other. */
#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) \
    | (1 << OTHER_LETTER))

/* Decimal digits only. */
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* Space, line and paragraph separators. */
#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

/* Connector punctuation on its own. */
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/* Every punctuation category. */
#define PUNCT_BITS \
    ((1 << CONNECTOR_PUNCTUATION) | (1 << DASH_PUNCTUATION) \
    | (1 << OPEN_PUNCTUATION) | (1 << CLOSE_PUNCTUATION) \
    | (1 << INITIAL_QUOTE_PUNCTUATION) | (1 << FINAL_QUOTE_PUNCTUATION) \
    | (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, punctuation, marks,
 * the remaining number categories and all symbol categories.
 */
#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS \
    | (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) \
    | (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) \
    | (1 << OTHER_NUMBER) | (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) \
    | (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * Unicode characters less than this value are represented by themselves
 * in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
60 |
|
61 |
/* |
62 |
* The following structures are used when mapping between Unicode (UCS-2) |
63 |
* and UTF-8. |
64 |
*/ |
65 |
|
66 |
/*
 * totalBytes[b] is the number of bytes in the UTF-8 sequence whose first
 * byte is b. Bytes that cannot start a multi-byte sequence under the
 * configured TCL_UTF_MAX are counted as one-byte sequences.
 */
CONST unsigned char totalBytes[256] = {
    /* 0x00 - 0xBF: single bytes (including naked trail bytes). */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x00 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x10 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x20 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x30 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x40 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x50 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x60 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x70 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x80 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0x90 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0xA0 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,	/* 0xB0 */
    /* 0xC0 - 0xDF: lead bytes of two-byte sequences. */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	/* 0xC0 */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,	/* 0xD0 */
    /* 0xE0 - 0xEF: lead bytes of three-byte sequences. */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,	/* 0xE0 */
    /* 0xF0 - 0xF7 */
#if TCL_UTF_MAX > 3
    4,4,4,4,4,4,4,4,
#else
    1,1,1,1,1,1,1,1,
#endif
    /* 0xF8 - 0xFB */
#if TCL_UTF_MAX > 4
    5,5,5,5,
#else
    1,1,1,1,
#endif
    /* 0xFC - 0xFF */
#if TCL_UTF_MAX > 5
    6,6,6,6
#else
    1,1,1,1
#endif
};
91 |
|
92 |
/* |
93 |
* Procedures used only in this module. |
94 |
*/ |
95 |
|
96 |
static int UtfCount _ANSI_ARGS_((int ch)); |
97 |
|
98 |
|
99 |
/* |
100 |
*--------------------------------------------------------------------------- |
101 |
* |
102 |
* UtfCount -- |
103 |
* |
104 |
* Find the number of bytes in the Utf character "ch". |
105 |
* |
106 |
* Results: |
107 |
* The return value is the number of bytes in the Utf character "ch". |
108 |
* |
109 |
* Side effects: |
110 |
* None. |
111 |
* |
112 |
*--------------------------------------------------------------------------- |
113 |
*/ |
114 |
|
115 |
static int |
116 |
UtfCount(ch) |
117 |
int ch; /* The Tcl_UniChar whose size is returned. */ |
118 |
{ |
119 |
if ((ch > 0) && (ch < UNICODE_SELF)) { |
120 |
return 1; |
121 |
} |
122 |
if (ch <= 0x7FF) { |
123 |
return 2; |
124 |
} |
125 |
if (ch <= 0xFFFF) { |
126 |
return 3; |
127 |
} |
128 |
#if TCL_UTF_MAX > 3 |
129 |
if (ch <= 0x1FFFFF) { |
130 |
return 4; |
131 |
} |
132 |
if (ch <= 0x3FFFFFF) { |
133 |
return 5; |
134 |
} |
135 |
if (ch <= 0x7FFFFFFF) { |
136 |
return 6; |
137 |
} |
138 |
#endif |
139 |
return 3; |
140 |
} |
141 |
|
142 |
/* |
143 |
*--------------------------------------------------------------------------- |
144 |
* |
145 |
* Tcl_UniCharToUtf -- |
146 |
* |
147 |
* Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the |
148 |
* provided buffer. Equivalent to Plan 9 runetochar(). |
149 |
* |
150 |
* Results: |
151 |
* The return value is the number of bytes in the buffer that |
152 |
* were consumed. |
153 |
* |
154 |
* Side effects: |
155 |
* None. |
156 |
* |
157 |
*--------------------------------------------------------------------------- |
158 |
*/ |
159 |
|
160 |
INLINE int |
161 |
Tcl_UniCharToUtf(ch, str) |
162 |
int ch; /* The Tcl_UniChar to be stored in the |
163 |
* buffer. */ |
164 |
char *str; /* Buffer in which the UTF-8 representation |
165 |
* of the Tcl_UniChar is stored. Buffer must |
166 |
* be large enough to hold the UTF-8 character |
167 |
* (at most TCL_UTF_MAX bytes). */ |
168 |
{ |
169 |
if ((ch > 0) && (ch < UNICODE_SELF)) { |
170 |
str[0] = (char) ch; |
171 |
return 1; |
172 |
} |
173 |
if (ch <= 0x7FF) { |
174 |
str[1] = (char) ((ch | 0x80) & 0xBF); |
175 |
str[0] = (char) ((ch >> 6) | 0xC0); |
176 |
return 2; |
177 |
} |
178 |
if (ch <= 0xFFFF) { |
179 |
three: |
180 |
str[2] = (char) ((ch | 0x80) & 0xBF); |
181 |
str[1] = (char) (((ch >> 6) | 0x80) & 0xBF); |
182 |
str[0] = (char) ((ch >> 12) | 0xE0); |
183 |
return 3; |
184 |
} |
185 |
|
186 |
#if TCL_UTF_MAX > 3 |
187 |
if (ch <= 0x1FFFFF) { |
188 |
str[3] = (char) ((ch | 0x80) & 0xBF); |
189 |
str[2] = (char) (((ch >> 6) | 0x80) & 0xBF); |
190 |
str[1] = (char) (((ch >> 12) | 0x80) & 0xBF); |
191 |
str[0] = (char) ((ch >> 18) | 0xF0); |
192 |
return 4; |
193 |
} |
194 |
if (ch <= 0x3FFFFFF) { |
195 |
str[4] = (char) ((ch | 0x80) & 0xBF); |
196 |
str[3] = (char) (((ch >> 6) | 0x80) & 0xBF); |
197 |
str[2] = (char) (((ch >> 12) | 0x80) & 0xBF); |
198 |
str[1] = (char) (((ch >> 18) | 0x80) & 0xBF); |
199 |
str[0] = (char) ((ch >> 24) | 0xF8); |
200 |
return 5; |
201 |
} |
202 |
if (ch <= 0x7FFFFFFF) { |
203 |
str[5] = (char) ((ch | 0x80) & 0xBF); |
204 |
str[4] = (char) (((ch >> 6) | 0x80) & 0xBF); |
205 |
str[3] = (char) (((ch >> 12) | 0x80) & 0xBF); |
206 |
str[2] = (char) (((ch >> 18) | 0x80) & 0xBF); |
207 |
str[1] = (char) (((ch >> 24) | 0x80) & 0xBF); |
208 |
str[0] = (char) ((ch >> 30) | 0xFC); |
209 |
return 6; |
210 |
} |
211 |
#endif |
212 |
|
213 |
ch = 0xFFFD; |
214 |
goto three; |
215 |
} |
216 |
|
217 |
/* |
218 |
*--------------------------------------------------------------------------- |
219 |
* |
220 |
* Tcl_UniCharToUtfDString -- |
221 |
* |
222 |
* Convert the given Unicode string to UTF-8. |
223 |
* |
224 |
* Results: |
225 |
* The return value is a pointer to the UTF-8 representation of the |
226 |
* Unicode string. Storage for the return value is appended to the |
227 |
* end of dsPtr. |
228 |
* |
229 |
* Side effects: |
230 |
* None. |
231 |
* |
232 |
*--------------------------------------------------------------------------- |
233 |
*/ |
234 |
|
235 |
char * |
236 |
Tcl_UniCharToUtfDString(wString, numChars, dsPtr) |
237 |
CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */ |
238 |
int numChars; /* Length of Unicode string in Tcl_UniChars |
239 |
* (must be >= 0). */ |
240 |
Tcl_DString *dsPtr; /* UTF-8 representation of string is |
241 |
* appended to this previously initialized |
242 |
* DString. */ |
243 |
{ |
244 |
CONST Tcl_UniChar *w, *wEnd; |
245 |
char *p, *string; |
246 |
int oldLength; |
247 |
|
248 |
/* |
249 |
* UTF-8 string length in bytes will be <= Unicode string length * |
250 |
* TCL_UTF_MAX. |
251 |
*/ |
252 |
|
253 |
oldLength = Tcl_DStringLength(dsPtr); |
254 |
Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX); |
255 |
string = Tcl_DStringValue(dsPtr) + oldLength; |
256 |
|
257 |
p = string; |
258 |
wEnd = wString + numChars; |
259 |
for (w = wString; w < wEnd; ) { |
260 |
p += Tcl_UniCharToUtf(*w, p); |
261 |
w++; |
262 |
} |
263 |
Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); |
264 |
|
265 |
return string; |
266 |
} |
267 |
|
268 |
/* |
269 |
*--------------------------------------------------------------------------- |
270 |
* |
271 |
* Tcl_UtfToUniChar -- |
272 |
* |
273 |
* Extract the Tcl_UniChar represented by the UTF-8 string. Bad |
274 |
* UTF-8 sequences are converted to valid Tcl_UniChars and processing |
275 |
* continues. Equivalent to Plan 9 chartorune(). |
276 |
* |
277 |
* The caller must ensure that the source buffer is long enough that |
278 |
* this routine does not run off the end and dereference non-existent |
279 |
* memory looking for trail bytes. If the source buffer is known to |
280 |
* be '\0' terminated, this cannot happen. Otherwise, the caller |
281 |
* should call Tcl_UtfCharComplete() before calling this routine to |
282 |
* ensure that enough bytes remain in the string. |
283 |
* |
284 |
* Results: |
285 |
* *chPtr is filled with the Tcl_UniChar, and the return value is the |
286 |
* number of bytes from the UTF-8 string that were consumed. |
287 |
* |
288 |
* Side effects: |
289 |
* None. |
290 |
* |
291 |
*--------------------------------------------------------------------------- |
292 |
*/ |
293 |
|
294 |
int |
295 |
Tcl_UtfToUniChar(str, chPtr) |
296 |
register CONST char *str; /* The UTF-8 string. */ |
297 |
register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented |
298 |
* by the UTF-8 string. */ |
299 |
{ |
300 |
register int byte; |
301 |
|
302 |
/* |
303 |
* Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. |
304 |
*/ |
305 |
|
306 |
byte = *((unsigned char *) str); |
307 |
if (byte < 0xC0) { |
308 |
/* |
309 |
* Handles properly formed UTF-8 characters between 0x01 and 0x7F. |
310 |
* Also treats \0 and naked trail bytes 0x80 to 0xBF as valid |
311 |
* characters representing themselves. |
312 |
*/ |
313 |
|
314 |
*chPtr = (Tcl_UniChar) byte; |
315 |
return 1; |
316 |
} else if (byte < 0xE0) { |
317 |
if ((str[1] & 0xC0) == 0x80) { |
318 |
/* |
319 |
* Two-byte-character lead-byte followed by a trail-byte. |
320 |
*/ |
321 |
|
322 |
*chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F)); |
323 |
return 2; |
324 |
} |
325 |
/* |
326 |
* A two-byte-character lead-byte not followed by trail-byte |
327 |
* represents itself. |
328 |
*/ |
329 |
|
330 |
*chPtr = (Tcl_UniChar) byte; |
331 |
return 1; |
332 |
} else if (byte < 0xF0) { |
333 |
if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) { |
334 |
/* |
335 |
* Three-byte-character lead byte followed by two trail bytes. |
336 |
*/ |
337 |
|
338 |
*chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) |
339 |
| ((str[1] & 0x3F) << 6) | (str[2] & 0x3F)); |
340 |
return 3; |
341 |
} |
342 |
/* |
343 |
* A three-byte-character lead-byte not followed by two trail-bytes |
344 |
* represents itself. |
345 |
*/ |
346 |
|
347 |
*chPtr = (Tcl_UniChar) byte; |
348 |
return 1; |
349 |
} |
350 |
#if TCL_UTF_MAX > 3 |
351 |
else { |
352 |
int ch, total, trail; |
353 |
|
354 |
total = totalBytes[byte]; |
355 |
trail = total - 1; |
356 |
if (trail > 0) { |
357 |
ch = byte & (0x3F >> trail); |
358 |
do { |
359 |
str++; |
360 |
if ((*str & 0xC0) != 0x80) { |
361 |
*chPtr = byte; |
362 |
return 1; |
363 |
} |
364 |
ch <<= 6; |
365 |
ch |= (*str & 0x3F); |
366 |
trail--; |
367 |
} while (trail > 0); |
368 |
*chPtr = ch; |
369 |
return total; |
370 |
} |
371 |
} |
372 |
#endif |
373 |
|
374 |
*chPtr = (Tcl_UniChar) byte; |
375 |
return 1; |
376 |
} |
377 |
|
378 |
/* |
379 |
*--------------------------------------------------------------------------- |
380 |
* |
381 |
* Tcl_UtfToUniCharDString -- |
382 |
* |
383 |
* Convert the UTF-8 string to Unicode. |
384 |
* |
385 |
* Results: |
386 |
* The return value is a pointer to the Unicode representation of the |
387 |
* UTF-8 string. Storage for the return value is appended to the |
388 |
* end of dsPtr. The Unicode string is terminated with a Unicode |
389 |
* NULL character. |
390 |
* |
391 |
* Side effects: |
392 |
* None. |
393 |
* |
394 |
*--------------------------------------------------------------------------- |
395 |
*/ |
396 |
|
397 |
Tcl_UniChar * |
398 |
Tcl_UtfToUniCharDString(string, length, dsPtr) |
399 |
CONST char *string; /* UTF-8 string to convert to Unicode. */ |
400 |
int length; /* Length of UTF-8 string in bytes, or -1 |
401 |
* for strlen(). */ |
402 |
Tcl_DString *dsPtr; /* Unicode representation of string is |
403 |
* appended to this previously initialized |
404 |
* DString. */ |
405 |
{ |
406 |
Tcl_UniChar *w, *wString; |
407 |
CONST char *p, *end; |
408 |
int oldLength; |
409 |
|
410 |
if (length < 0) { |
411 |
length = strlen(string); |
412 |
} |
413 |
|
414 |
/* |
415 |
* Unicode string length in Tcl_UniChars will be <= UTF-8 string length |
416 |
* in bytes. |
417 |
*/ |
418 |
|
419 |
oldLength = Tcl_DStringLength(dsPtr); |
420 |
Tcl_DStringSetLength(dsPtr, |
421 |
(int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); |
422 |
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); |
423 |
|
424 |
w = wString; |
425 |
end = string + length; |
426 |
for (p = string; p < end; ) { |
427 |
p += Tcl_UtfToUniChar(p, w); |
428 |
w++; |
429 |
} |
430 |
*w = '\0'; |
431 |
Tcl_DStringSetLength(dsPtr, |
432 |
(oldLength + ((char *) w - (char *) wString))); |
433 |
|
434 |
return wString; |
435 |
} |
436 |
|
437 |
/* |
438 |
*--------------------------------------------------------------------------- |
439 |
* |
440 |
* Tcl_UtfCharComplete -- |
441 |
* |
442 |
* Determine if the UTF-8 string of the given length is long enough |
443 |
* to be decoded by Tcl_UtfToUniChar(). This does not ensure that the |
444 |
* UTF-8 string is properly formed. Equivalent to Plan 9 fullrune(). |
445 |
* |
446 |
* Results: |
447 |
* The return value is 0 if the string is not long enough, non-zero |
448 |
* otherwise. |
449 |
* |
450 |
* Side effects: |
451 |
* None. |
452 |
* |
453 |
*--------------------------------------------------------------------------- |
454 |
*/ |
455 |
|
456 |
int |
457 |
Tcl_UtfCharComplete(str, len) |
458 |
CONST char *str; /* String to check if first few bytes |
459 |
* contain a complete UTF-8 character. */ |
460 |
int len; /* Length of above string in bytes. */ |
461 |
{ |
462 |
int ch; |
463 |
|
464 |
ch = *((unsigned char *) str); |
465 |
return len >= totalBytes[ch]; |
466 |
} |
467 |
|
468 |
/* |
469 |
*--------------------------------------------------------------------------- |
470 |
* |
471 |
* Tcl_NumUtfChars -- |
472 |
* |
473 |
* Returns the number of characters (not bytes) in the UTF-8 string, |
474 |
* not including the terminating NULL byte. This is equivalent to |
475 |
* Plan 9 utflen() and utfnlen(). |
476 |
* |
477 |
* Results: |
478 |
* As above. |
479 |
* |
480 |
* Side effects: |
481 |
* None. |
482 |
* |
483 |
*--------------------------------------------------------------------------- |
484 |
*/ |
485 |
|
486 |
int |
487 |
Tcl_NumUtfChars(str, len) |
488 |
register CONST char *str; /* The UTF-8 string to measure. */ |
489 |
int len; /* The length of the string in bytes, or -1 |
490 |
* for strlen(string). */ |
491 |
{ |
492 |
Tcl_UniChar ch; |
493 |
register Tcl_UniChar *chPtr = &ch; |
494 |
register int n; |
495 |
int i; |
496 |
|
497 |
/* |
498 |
* The separate implementations are faster. |
499 |
*/ |
500 |
|
501 |
i = 0; |
502 |
if (len < 0) { |
503 |
while (1) { |
504 |
str += Tcl_UtfToUniChar(str, chPtr); |
505 |
if (ch == '\0') { |
506 |
break; |
507 |
} |
508 |
i++; |
509 |
} |
510 |
} else { |
511 |
while (len > 0) { |
512 |
n = Tcl_UtfToUniChar(str, chPtr); |
513 |
len -= n; |
514 |
str += n; |
515 |
i++; |
516 |
} |
517 |
} |
518 |
return i; |
519 |
} |
520 |
|
521 |
/* |
522 |
*--------------------------------------------------------------------------- |
523 |
* |
524 |
* Tcl_UtfFindFirst -- |
525 |
* |
526 |
* Returns a pointer to the first occurrence of the given Tcl_UniChar |
527 |
* in the NULL-terminated UTF-8 string. The NULL terminator is |
528 |
* considered part of the UTF-8 string. Equivalent to Plan 9 |
529 |
* utfrune(). |
530 |
* |
531 |
* Results: |
532 |
* As above. If the Tcl_UniChar does not exist in the given string, |
533 |
* the return value is NULL. |
534 |
* |
535 |
* Side effects: |
536 |
* None. |
537 |
* |
538 |
*--------------------------------------------------------------------------- |
539 |
*/ |
540 |
char * |
541 |
Tcl_UtfFindFirst(string, ch) |
542 |
CONST char *string; /* The UTF-8 string to be searched. */ |
543 |
int ch; /* The Tcl_UniChar to search for. */ |
544 |
{ |
545 |
int len; |
546 |
Tcl_UniChar find; |
547 |
|
548 |
while (1) { |
549 |
len = Tcl_UtfToUniChar(string, &find); |
550 |
if (find == ch) { |
551 |
return (char *) string; |
552 |
} |
553 |
if (*string == '\0') { |
554 |
return NULL; |
555 |
} |
556 |
string += len; |
557 |
} |
558 |
} |
559 |
|
560 |
/* |
561 |
*--------------------------------------------------------------------------- |
562 |
* |
563 |
* Tcl_UtfFindLast -- |
564 |
* |
565 |
* Returns a pointer to the last occurrence of the given Tcl_UniChar |
566 |
* in the NULL-terminated UTF-8 string. The NULL terminator is |
567 |
* considered part of the UTF-8 string. Equivalent to Plan 9 |
568 |
* utfrrune(). |
569 |
* |
570 |
* Results: |
571 |
* As above. If the Tcl_UniChar does not exist in the given string, |
572 |
* the return value is NULL. |
573 |
* |
574 |
* Side effects: |
575 |
* None. |
576 |
* |
577 |
*--------------------------------------------------------------------------- |
578 |
*/ |
579 |
|
580 |
char * |
581 |
Tcl_UtfFindLast(string, ch) |
582 |
CONST char *string; /* The UTF-8 string to be searched. */ |
583 |
int ch; /* The Tcl_UniChar to search for. */ |
584 |
{ |
585 |
int len; |
586 |
Tcl_UniChar find; |
587 |
CONST char *last; |
588 |
|
589 |
last = NULL; |
590 |
while (1) { |
591 |
len = Tcl_UtfToUniChar(string, &find); |
592 |
if (find == ch) { |
593 |
last = string; |
594 |
} |
595 |
if (*string == '\0') { |
596 |
break; |
597 |
} |
598 |
string += len; |
599 |
} |
600 |
return (char *) last; |
601 |
} |
602 |
|
603 |
/* |
604 |
*--------------------------------------------------------------------------- |
605 |
* |
606 |
* Tcl_UtfNext -- |
607 |
* |
608 |
* Given a pointer to some current location in a UTF-8 string, |
609 |
* move forward one character. The caller must ensure that they |
610 |
* are not asking for the next character after the last character |
611 |
* in the string. |
612 |
* |
613 |
* Results: |
614 |
* The return value is the pointer to the next character in |
615 |
* the UTF-8 string. |
616 |
* |
617 |
* Side effects: |
618 |
* None. |
619 |
* |
620 |
*--------------------------------------------------------------------------- |
621 |
*/ |
622 |
|
623 |
char * |
624 |
Tcl_UtfNext(str) |
625 |
CONST char *str; /* The current location in the string. */ |
626 |
{ |
627 |
Tcl_UniChar ch; |
628 |
|
629 |
return (char *) str + Tcl_UtfToUniChar(str, &ch); |
630 |
} |
631 |
|
632 |
/* |
633 |
*--------------------------------------------------------------------------- |
634 |
* |
635 |
* Tcl_UtfPrev -- |
636 |
* |
637 |
* Given a pointer to some current location in a UTF-8 string, |
638 |
* move backwards one character. |
639 |
* |
640 |
* Results: |
641 |
* The return value is a pointer to the previous character in the |
642 |
* UTF-8 string. If the current location was already at the |
643 |
* beginning of the string, the return value will also be a |
644 |
* pointer to the beginning of the string. |
645 |
* |
646 |
* Side effects: |
647 |
* None. |
648 |
* |
649 |
*--------------------------------------------------------------------------- |
650 |
*/ |
651 |
|
652 |
char * |
653 |
Tcl_UtfPrev(str, start) |
654 |
CONST char *str; /* The current location in the string. */ |
655 |
CONST char *start; /* Pointer to the beginning of the |
656 |
* string, to avoid going backwards too |
657 |
* far. */ |
658 |
{ |
659 |
CONST char *look; |
660 |
int i, byte; |
661 |
|
662 |
str--; |
663 |
look = str; |
664 |
for (i = 0; i < TCL_UTF_MAX; i++) { |
665 |
if (look < start) { |
666 |
if (str < start) { |
667 |
str = start; |
668 |
} |
669 |
break; |
670 |
} |
671 |
byte = *((unsigned char *) look); |
672 |
if (byte < 0x80) { |
673 |
break; |
674 |
} |
675 |
if (byte >= 0xC0) { |
676 |
if (totalBytes[byte] != i + 1) { |
677 |
break; |
678 |
} |
679 |
return (char *) look; |
680 |
} |
681 |
look--; |
682 |
} |
683 |
return (char *) str; |
684 |
} |
685 |
|
686 |
/* |
687 |
*--------------------------------------------------------------------------- |
688 |
* |
689 |
* Tcl_UniCharAtIndex -- |
690 |
* |
691 |
* Returns the Unicode character represented at the specified |
692 |
* character (not byte) position in the UTF-8 string. |
693 |
* |
694 |
* Results: |
695 |
* As above. |
696 |
* |
697 |
* Side effects: |
698 |
* None. |
699 |
* |
700 |
*--------------------------------------------------------------------------- |
701 |
*/ |
702 |
|
703 |
Tcl_UniChar |
704 |
Tcl_UniCharAtIndex(src, index) |
705 |
register CONST char *src; /* The UTF-8 string to dereference. */ |
706 |
register int index; /* The position of the desired character. */ |
707 |
{ |
708 |
Tcl_UniChar ch; |
709 |
|
710 |
while (index >= 0) { |
711 |
index--; |
712 |
src += Tcl_UtfToUniChar(src, &ch); |
713 |
} |
714 |
return ch; |
715 |
} |
716 |
|
717 |
/* |
718 |
*--------------------------------------------------------------------------- |
719 |
* |
720 |
* Tcl_UtfAtIndex -- |
721 |
* |
722 |
* Returns a pointer to the specified character (not byte) position |
723 |
* in the UTF-8 string. |
724 |
* |
725 |
* Results: |
726 |
* As above. |
727 |
* |
728 |
* Side effects: |
729 |
* None. |
730 |
* |
731 |
*--------------------------------------------------------------------------- |
732 |
*/ |
733 |
|
734 |
char * |
735 |
Tcl_UtfAtIndex(src, index) |
736 |
register CONST char *src; /* The UTF-8 string. */ |
737 |
register int index; /* The position of the desired character. */ |
738 |
{ |
739 |
Tcl_UniChar ch; |
740 |
|
741 |
while (index > 0) { |
742 |
index--; |
743 |
src += Tcl_UtfToUniChar(src, &ch); |
744 |
} |
745 |
return (char *) src; |
746 |
} |
747 |
|
748 |
/* |
749 |
*--------------------------------------------------------------------------- |
750 |
* |
751 |
* Tcl_UtfBackslash -- |
752 |
* |
753 |
* Figure out how to handle a backslash sequence. |
754 |
* |
755 |
* Results: |
756 |
* Stores the bytes represented by the backslash sequence in dst and |
757 |
* returns the number of bytes written to dst. At most TCL_UTF_MAX |
758 |
* bytes are written to dst; dst must have been large enough to accept |
759 |
* those bytes. If readPtr isn't NULL then it is filled in with a |
760 |
* count of the number of bytes in the backslash sequence. |
761 |
* |
762 |
* Side effects: |
763 |
* The maximum number of bytes it takes to represent a Unicode |
764 |
* character in UTF-8 is guaranteed to be less than the number of |
765 |
* bytes used to express the backslash sequence that represents |
766 |
* that Unicode character. If the target buffer into which the |
767 |
* caller is going to store the bytes that represent the Unicode |
768 |
* character is at least as large as the source buffer from which |
769 |
* the backslashed sequence was extracted, no buffer overruns should |
770 |
* occur. |
771 |
* |
772 |
*--------------------------------------------------------------------------- |
773 |
*/ |
774 |
|
775 |
int |
776 |
Tcl_UtfBackslash(src, readPtr, dst) |
777 |
CONST char *src; /* Points to the backslash character of |
778 |
* a backslash sequence. */ |
779 |
int *readPtr; /* Fill in with number of characters read |
780 |
* from src, unless NULL. */ |
781 |
char *dst; /* Filled with the bytes represented by the |
782 |
* backslash sequence. */ |
783 |
{ |
784 |
register CONST char *p = src+1; |
785 |
int result, count, n; |
786 |
char buf[TCL_UTF_MAX]; |
787 |
|
788 |
if (dst == NULL) { |
789 |
dst = buf; |
790 |
} |
791 |
|
792 |
count = 2; |
793 |
switch (*p) { |
794 |
/* |
795 |
* Note: in the conversions below, use absolute values (e.g., |
796 |
* 0xa) rather than symbolic values (e.g. \n) that get converted |
797 |
* by the compiler. It's possible that compilers on some |
798 |
* platforms will do the symbolic conversions differently, which |
799 |
* could result in non-portable Tcl scripts. |
800 |
*/ |
801 |
|
802 |
case 'a': |
803 |
result = 0x7; |
804 |
break; |
805 |
case 'b': |
806 |
result = 0x8; |
807 |
break; |
808 |
case 'f': |
809 |
result = 0xc; |
810 |
break; |
811 |
case 'n': |
812 |
result = 0xa; |
813 |
break; |
814 |
case 'r': |
815 |
result = 0xd; |
816 |
break; |
817 |
case 't': |
818 |
result = 0x9; |
819 |
break; |
820 |
case 'v': |
821 |
result = 0xb; |
822 |
break; |
823 |
case 'x': |
824 |
if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */ |
825 |
char *end; |
826 |
|
827 |
result = (unsigned char) strtoul(p+1, &end, 16); |
828 |
count = end - src; |
829 |
} else { |
830 |
count = 2; |
831 |
result = 'x'; |
832 |
} |
833 |
break; |
834 |
case 'u': |
835 |
result = 0; |
836 |
for (count = 0; count < 4; count++) { |
837 |
p++; |
838 |
if (!isxdigit(UCHAR(*p))) { /* INTL: digit */ |
839 |
break; |
840 |
} |
841 |
n = *p - '0'; |
842 |
if (n > 9) { |
843 |
n = n + '0' + 10 - 'A'; |
844 |
} |
845 |
if (n > 16) { |
846 |
n = n + 'A' - 'a'; |
847 |
} |
848 |
result = (result << 4) + n; |
849 |
} |
850 |
if (count == 0) { |
851 |
result = 'u'; |
852 |
} |
853 |
count += 2; |
854 |
break; |
855 |
|
856 |
case '\n': |
857 |
do { |
858 |
p++; |
859 |
} while ((*p == ' ') || (*p == '\t')); |
860 |
result = ' '; |
861 |
count = p - src; |
862 |
break; |
863 |
case 0: |
864 |
result = '\\'; |
865 |
count = 1; |
866 |
break; |
867 |
default: |
868 |
/* |
869 |
* Check for an octal number \oo?o? |
870 |
*/ |
871 |
if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */ |
872 |
result = (unsigned char)(*p - '0'); |
873 |
p++; |
874 |
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
875 |
break; |
876 |
} |
877 |
count = 3; |
878 |
result = (unsigned char)((result << 3) + (*p - '0')); |
879 |
p++; |
880 |
if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */ |
881 |
break; |
882 |
} |
883 |
count = 4; |
884 |
result = (unsigned char)((result << 3) + (*p - '0')); |
885 |
break; |
886 |
} |
887 |
result = *p; |
888 |
count = 2; |
889 |
break; |
890 |
} |
891 |
|
892 |
if (readPtr != NULL) { |
893 |
*readPtr = count; |
894 |
} |
895 |
return Tcl_UniCharToUtf(result, dst); |
896 |
} |
897 |
|
898 |
/* |
899 |
*---------------------------------------------------------------------- |
900 |
* |
901 |
* Tcl_UtfToUpper -- |
902 |
* |
903 |
* Convert lowercase characters to uppercase characters in a UTF |
904 |
* string in place. The conversion may shrink the UTF string. |
905 |
* |
906 |
* Results: |
907 |
* Returns the number of bytes in the resulting string |
908 |
* excluding the trailing null. |
909 |
* |
910 |
* Side effects: |
911 |
* Writes a terminating null after the last converted character. |
912 |
* |
913 |
*---------------------------------------------------------------------- |
914 |
*/ |
915 |
|
916 |
int |
917 |
Tcl_UtfToUpper(str) |
918 |
char *str; /* String to convert in place. */ |
919 |
{ |
920 |
Tcl_UniChar ch, upChar; |
921 |
char *src, *dst; |
922 |
int bytes; |
923 |
|
924 |
/* |
925 |
* Iterate over the string until we hit the terminating null. |
926 |
*/ |
927 |
|
928 |
src = dst = str; |
929 |
while (*src) { |
930 |
bytes = Tcl_UtfToUniChar(src, &ch); |
931 |
upChar = Tcl_UniCharToUpper(ch); |
932 |
|
933 |
/* |
934 |
* To keep badly formed Utf strings from getting inflated by |
935 |
* the conversion (thereby causing a segfault), only copy the |
936 |
* upper case char to dst if its size is <= the original char. |
937 |
*/ |
938 |
|
939 |
if (bytes < UtfCount(upChar)) { |
940 |
memcpy(dst, src, (size_t) bytes); |
941 |
dst += bytes; |
942 |
} else { |
943 |
dst += Tcl_UniCharToUtf(upChar, dst); |
944 |
} |
945 |
src += bytes; |
946 |
} |
947 |
*dst = '\0'; |
948 |
return (dst - str); |
949 |
} |
950 |
|
951 |
/* |
952 |
*---------------------------------------------------------------------- |
953 |
* |
954 |
* Tcl_UtfToLower -- |
955 |
* |
956 |
* Convert uppercase characters to lowercase characters in a UTF |
957 |
* string in place. The conversion may shrink the UTF string. |
958 |
* |
959 |
* Results: |
960 |
* Returns the number of bytes in the resulting string |
961 |
* excluding the trailing null. |
962 |
* |
963 |
* Side effects: |
964 |
* Writes a terminating null after the last converted character. |
965 |
* |
966 |
*---------------------------------------------------------------------- |
967 |
*/ |
968 |
|
969 |
int |
970 |
Tcl_UtfToLower(str) |
971 |
char *str; /* String to convert in place. */ |
972 |
{ |
973 |
Tcl_UniChar ch, lowChar; |
974 |
char *src, *dst; |
975 |
int bytes; |
976 |
|
977 |
/* |
978 |
* Iterate over the string until we hit the terminating null. |
979 |
*/ |
980 |
|
981 |
src = dst = str; |
982 |
while (*src) { |
983 |
bytes = Tcl_UtfToUniChar(src, &ch); |
984 |
lowChar = Tcl_UniCharToLower(ch); |
985 |
|
986 |
/* |
987 |
* To keep badly formed Utf strings from getting inflated by |
988 |
* the conversion (thereby causing a segfault), only copy the |
989 |
* lower case char to dst if its size is <= the original char. |
990 |
*/ |
991 |
|
992 |
if (bytes < UtfCount(lowChar)) { |
993 |
memcpy(dst, src, (size_t) bytes); |
994 |
dst += bytes; |
995 |
} else { |
996 |
dst += Tcl_UniCharToUtf(lowChar, dst); |
997 |
} |
998 |
src += bytes; |
999 |
} |
1000 |
*dst = '\0'; |
1001 |
return (dst - str); |
1002 |
} |
1003 |
|
1004 |
/* |
1005 |
*---------------------------------------------------------------------- |
1006 |
* |
1007 |
* Tcl_UtfToTitle -- |
1008 |
* |
1009 |
* Changes the first character of a UTF string to title case or |
1010 |
* uppercase and the rest of the string to lowercase. The |
1011 |
* conversion happens in place and may shrink the UTF string. |
1012 |
* |
1013 |
* Results: |
1014 |
* Returns the number of bytes in the resulting string |
1015 |
* excluding the trailing null. |
1016 |
* |
1017 |
* Side effects: |
1018 |
* Writes a terminating null after the last converted character. |
1019 |
* |
1020 |
*---------------------------------------------------------------------- |
1021 |
*/ |
1022 |
|
1023 |
int |
1024 |
Tcl_UtfToTitle(str) |
1025 |
char *str; /* String to convert in place. */ |
1026 |
{ |
1027 |
Tcl_UniChar ch, titleChar, lowChar; |
1028 |
char *src, *dst; |
1029 |
int bytes; |
1030 |
|
1031 |
/* |
1032 |
* Capitalize the first character and then lowercase the rest of the |
1033 |
* characters until we get to a null. |
1034 |
*/ |
1035 |
|
1036 |
src = dst = str; |
1037 |
|
1038 |
if (*src) { |
1039 |
bytes = Tcl_UtfToUniChar(src, &ch); |
1040 |
titleChar = Tcl_UniCharToTitle(ch); |
1041 |
|
1042 |
if (bytes < UtfCount(titleChar)) { |
1043 |
memcpy(dst, src, (size_t) bytes); |
1044 |
dst += bytes; |
1045 |
} else { |
1046 |
dst += Tcl_UniCharToUtf(titleChar, dst); |
1047 |
} |
1048 |
src += bytes; |
1049 |
} |
1050 |
while (*src) { |
1051 |
bytes = Tcl_UtfToUniChar(src, &ch); |
1052 |
lowChar = Tcl_UniCharToLower(ch); |
1053 |
|
1054 |
if (bytes < UtfCount(lowChar)) { |
1055 |
memcpy(dst, src, (size_t) bytes); |
1056 |
dst += bytes; |
1057 |
} else { |
1058 |
dst += Tcl_UniCharToUtf(lowChar, dst); |
1059 |
} |
1060 |
src += bytes; |
1061 |
} |
1062 |
*dst = '\0'; |
1063 |
return (dst - str); |
1064 |
} |
1065 |
|
1066 |
/* |
1067 |
*---------------------------------------------------------------------- |
1068 |
* |
1069 |
* Tcl_UtfNcmp -- |
1070 |
* |
1071 |
* Compare at most n UTF chars of string cs to string ct. Both cs |
1072 |
* and ct are assumed to be at least n UTF chars long. |
1073 |
* |
1074 |
* Results: |
1075 |
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1076 |
* |
1077 |
* Side effects: |
1078 |
* None. |
1079 |
* |
1080 |
*---------------------------------------------------------------------- |
1081 |
*/ |
1082 |
|
1083 |
int |
1084 |
Tcl_UtfNcmp(cs, ct, n) |
1085 |
CONST char *cs; /* UTF string to compare to ct. */ |
1086 |
CONST char *ct; /* UTF string cs is compared to. */ |
1087 |
unsigned long n; /* Number of UTF chars to compare. */ |
1088 |
{ |
1089 |
Tcl_UniChar ch1, ch2; |
1090 |
/* |
1091 |
* Another approach that should work is: |
1092 |
* return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs)); |
1093 |
* That assumes that ct is a properly formed UTF, so we will just |
1094 |
* be comparing the bytes that compromise those strings to the |
1095 |
* char length n. |
1096 |
*/ |
1097 |
while (n-- > 0) { |
1098 |
/* |
1099 |
* n must be interpreted as chars, not bytes. |
1100 |
* This should be called only when both strings are of |
1101 |
* at least n chars long (no need for \0 check) |
1102 |
*/ |
1103 |
cs += Tcl_UtfToUniChar(cs, &ch1); |
1104 |
ct += Tcl_UtfToUniChar(ct, &ch2); |
1105 |
if (ch1 != ch2) { |
1106 |
return (ch1 - ch2); |
1107 |
} |
1108 |
} |
1109 |
return 0; |
1110 |
} |
1111 |
|
1112 |
/* |
1113 |
*---------------------------------------------------------------------- |
1114 |
* |
1115 |
* Tcl_UtfNcasecmp -- |
1116 |
* |
1117 |
* Compare at most n UTF chars of string cs to string ct case |
1118 |
* insensitive. Both cs and ct are assumed to be at least n |
1119 |
* UTF chars long. |
1120 |
* |
1121 |
* Results: |
1122 |
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1123 |
* |
1124 |
* Side effects: |
1125 |
* None. |
1126 |
* |
1127 |
*---------------------------------------------------------------------- |
1128 |
*/ |
1129 |
|
1130 |
int |
1131 |
Tcl_UtfNcasecmp(cs, ct, n) |
1132 |
CONST char *cs; /* UTF string to compare to ct. */ |
1133 |
CONST char *ct; /* UTF string cs is compared to. */ |
1134 |
unsigned long n; /* Number of UTF chars to compare. */ |
1135 |
{ |
1136 |
Tcl_UniChar ch1, ch2; |
1137 |
while (n-- > 0) { |
1138 |
/* |
1139 |
* n must be interpreted as chars, not bytes. |
1140 |
* This should be called only when both strings are of |
1141 |
* at least n chars long (no need for \0 check) |
1142 |
*/ |
1143 |
cs += Tcl_UtfToUniChar(cs, &ch1); |
1144 |
ct += Tcl_UtfToUniChar(ct, &ch2); |
1145 |
if (ch1 != ch2) { |
1146 |
ch1 = Tcl_UniCharToLower(ch1); |
1147 |
ch2 = Tcl_UniCharToLower(ch2); |
1148 |
if (ch1 != ch2) { |
1149 |
return (ch1 - ch2); |
1150 |
} |
1151 |
} |
1152 |
} |
1153 |
return 0; |
1154 |
} |
1155 |
|
1156 |
/* |
1157 |
*---------------------------------------------------------------------- |
1158 |
* |
1159 |
* Tcl_UniCharToUpper -- |
1160 |
* |
1161 |
* Compute the uppercase equivalent of the given Unicode character. |
1162 |
* |
1163 |
* Results: |
1164 |
* Returns the uppercase Unicode character. |
1165 |
* |
1166 |
* Side effects: |
1167 |
* None. |
1168 |
* |
1169 |
*---------------------------------------------------------------------- |
1170 |
*/ |
1171 |
|
1172 |
Tcl_UniChar |
1173 |
Tcl_UniCharToUpper(ch) |
1174 |
int ch; /* Unicode character to convert. */ |
1175 |
{ |
1176 |
int info = GetUniCharInfo(ch); |
1177 |
|
1178 |
if (GetCaseType(info) & 0x04) { |
1179 |
return (Tcl_UniChar) (ch - GetDelta(info)); |
1180 |
} else { |
1181 |
return ch; |
1182 |
} |
1183 |
} |
1184 |
|
1185 |
/* |
1186 |
*---------------------------------------------------------------------- |
1187 |
* |
1188 |
* Tcl_UniCharToLower -- |
1189 |
* |
1190 |
* Compute the lowercase equivalent of the given Unicode character. |
1191 |
* |
1192 |
* Results: |
1193 |
* Returns the lowercase Unicode character. |
1194 |
* |
1195 |
* Side effects: |
1196 |
* None. |
1197 |
* |
1198 |
*---------------------------------------------------------------------- |
1199 |
*/ |
1200 |
|
1201 |
Tcl_UniChar |
1202 |
Tcl_UniCharToLower(ch) |
1203 |
int ch; /* Unicode character to convert. */ |
1204 |
{ |
1205 |
int info = GetUniCharInfo(ch); |
1206 |
|
1207 |
if (GetCaseType(info) & 0x02) { |
1208 |
return (Tcl_UniChar) (ch + GetDelta(info)); |
1209 |
} else { |
1210 |
return ch; |
1211 |
} |
1212 |
} |
1213 |
|
1214 |
/* |
1215 |
*---------------------------------------------------------------------- |
1216 |
* |
1217 |
* Tcl_UniCharToTitle -- |
1218 |
* |
1219 |
* Compute the titlecase equivalent of the given Unicode character. |
1220 |
* |
1221 |
* Results: |
1222 |
* Returns the titlecase Unicode character. |
1223 |
* |
1224 |
* Side effects: |
1225 |
* None. |
1226 |
* |
1227 |
*---------------------------------------------------------------------- |
1228 |
*/ |
1229 |
|
1230 |
Tcl_UniChar |
1231 |
Tcl_UniCharToTitle(ch) |
1232 |
int ch; /* Unicode character to convert. */ |
1233 |
{ |
1234 |
int info = GetUniCharInfo(ch); |
1235 |
int mode = GetCaseType(info); |
1236 |
|
1237 |
if (mode & 0x1) { |
1238 |
/* |
1239 |
* Subtract or add one depending on the original case. |
1240 |
*/ |
1241 |
|
1242 |
return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); |
1243 |
} else if (mode == 0x4) { |
1244 |
return (Tcl_UniChar) (ch - GetDelta(info)); |
1245 |
} else { |
1246 |
return ch; |
1247 |
} |
1248 |
} |
1249 |
|
1250 |
/* |
1251 |
*---------------------------------------------------------------------- |
1252 |
* |
1253 |
* Tcl_UniCharLen -- |
1254 |
* |
1255 |
* Find the length of a UniChar string. The str input must be null |
1256 |
* terminated. |
1257 |
* |
1258 |
* Results: |
1259 |
* Returns the length of str in UniChars (not bytes). |
1260 |
* |
1261 |
* Side effects: |
1262 |
* None. |
1263 |
* |
1264 |
*---------------------------------------------------------------------- |
1265 |
*/ |
1266 |
|
1267 |
int |
1268 |
Tcl_UniCharLen(str) |
1269 |
Tcl_UniChar *str; /* Unicode string to find length of. */ |
1270 |
{ |
1271 |
int len = 0; |
1272 |
|
1273 |
while (*str != '\0') { |
1274 |
len++; |
1275 |
str++; |
1276 |
} |
1277 |
return len; |
1278 |
} |
1279 |
|
1280 |
/* |
1281 |
*---------------------------------------------------------------------- |
1282 |
* |
1283 |
* Tcl_UniCharNcmp -- |
1284 |
* |
1285 |
* Compare at most n unichars of string cs to string ct. Both cs |
1286 |
* and ct are assumed to be at least n unichars long. |
1287 |
* |
1288 |
* Results: |
1289 |
* Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. |
1290 |
* |
1291 |
* Side effects: |
1292 |
* None. |
1293 |
* |
1294 |
*---------------------------------------------------------------------- |
1295 |
*/ |
1296 |
|
1297 |
int |
1298 |
Tcl_UniCharNcmp(cs, ct, n) |
1299 |
CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */ |
1300 |
CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */ |
1301 |
unsigned long n; /* Number of unichars to compare. */ |
1302 |
{ |
1303 |
for ( ; n != 0; n--, cs++, ct++) { |
1304 |
if (*cs != *ct) { |
1305 |
return *cs - *ct; |
1306 |
} |
1307 |
if (*cs == '\0') { |
1308 |
break; |
1309 |
} |
1310 |
} |
1311 |
return 0; |
1312 |
} |
1313 |
|
1314 |
/* |
1315 |
*---------------------------------------------------------------------- |
1316 |
* |
1317 |
* Tcl_UniCharIsAlnum -- |
1318 |
* |
1319 |
* Test if a character is an alphanumeric Unicode character. |
1320 |
* |
1321 |
* Results: |
1322 |
* Returns 1 if character is alphanumeric. |
1323 |
* |
1324 |
* Side effects: |
1325 |
* None. |
1326 |
* |
1327 |
*---------------------------------------------------------------------- |
1328 |
*/ |
1329 |
|
1330 |
int |
1331 |
Tcl_UniCharIsAlnum(ch) |
1332 |
int ch; /* Unicode character to test. */ |
1333 |
{ |
1334 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1335 |
|
1336 |
return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); |
1337 |
} |
1338 |
|
1339 |
/* |
1340 |
*---------------------------------------------------------------------- |
1341 |
* |
1342 |
* Tcl_UniCharIsAlpha -- |
1343 |
* |
1344 |
* Test if a character is an alphabetic Unicode character. |
1345 |
* |
1346 |
* Results: |
1347 |
* Returns 1 if character is alphabetic. |
1348 |
* |
1349 |
* Side effects: |
1350 |
* None. |
1351 |
* |
1352 |
*---------------------------------------------------------------------- |
1353 |
*/ |
1354 |
|
1355 |
int |
1356 |
Tcl_UniCharIsAlpha(ch) |
1357 |
int ch; /* Unicode character to test. */ |
1358 |
{ |
1359 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1360 |
return ((ALPHA_BITS >> category) & 1); |
1361 |
} |
1362 |
|
1363 |
/* |
1364 |
*---------------------------------------------------------------------- |
1365 |
* |
1366 |
* Tcl_UniCharIsControl -- |
1367 |
* |
1368 |
* Test if a character is a Unicode control character. |
1369 |
* |
1370 |
* Results: |
1371 |
* Returns non-zero if character is a control. |
1372 |
* |
1373 |
* Side effects: |
1374 |
* None. |
1375 |
* |
1376 |
*---------------------------------------------------------------------- |
1377 |
*/ |
1378 |
|
1379 |
int |
1380 |
Tcl_UniCharIsControl(ch) |
1381 |
int ch; /* Unicode character to test. */ |
1382 |
{ |
1383 |
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); |
1384 |
} |
1385 |
|
1386 |
/* |
1387 |
*---------------------------------------------------------------------- |
1388 |
* |
1389 |
* Tcl_UniCharIsDigit -- |
1390 |
* |
1391 |
* Test if a character is a numeric Unicode character. |
1392 |
* |
1393 |
* Results: |
1394 |
* Returns non-zero if character is a digit. |
1395 |
* |
1396 |
* Side effects: |
1397 |
* None. |
1398 |
* |
1399 |
*---------------------------------------------------------------------- |
1400 |
*/ |
1401 |
|
1402 |
int |
1403 |
Tcl_UniCharIsDigit(ch) |
1404 |
int ch; /* Unicode character to test. */ |
1405 |
{ |
1406 |
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) |
1407 |
== DECIMAL_DIGIT_NUMBER); |
1408 |
} |
1409 |
|
1410 |
/* |
1411 |
*---------------------------------------------------------------------- |
1412 |
* |
1413 |
* Tcl_UniCharIsGraph -- |
1414 |
* |
1415 |
* Test if a character is any Unicode print character except space. |
1416 |
* |
1417 |
* Results: |
1418 |
* Returns non-zero if character is printable, but not space. |
1419 |
* |
1420 |
* Side effects: |
1421 |
* None. |
1422 |
* |
1423 |
*---------------------------------------------------------------------- |
1424 |
*/ |
1425 |
|
1426 |
int |
1427 |
Tcl_UniCharIsGraph(ch) |
1428 |
int ch; /* Unicode character to test. */ |
1429 |
{ |
1430 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1431 |
return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); |
1432 |
} |
1433 |
|
1434 |
/* |
1435 |
*---------------------------------------------------------------------- |
1436 |
* |
1437 |
* Tcl_UniCharIsLower -- |
1438 |
* |
1439 |
* Test if a character is a lowercase Unicode character. |
1440 |
* |
1441 |
* Results: |
1442 |
* Returns non-zero if character is lowercase. |
1443 |
* |
1444 |
* Side effects: |
1445 |
* None. |
1446 |
* |
1447 |
*---------------------------------------------------------------------- |
1448 |
*/ |
1449 |
|
1450 |
int |
1451 |
Tcl_UniCharIsLower(ch) |
1452 |
int ch; /* Unicode character to test. */ |
1453 |
{ |
1454 |
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); |
1455 |
} |
1456 |
|
1457 |
/* |
1458 |
*---------------------------------------------------------------------- |
1459 |
* |
1460 |
* Tcl_UniCharIsPrint -- |
1461 |
* |
1462 |
* Test if a character is a Unicode print character. |
1463 |
* |
1464 |
* Results: |
1465 |
* Returns non-zero if character is printable. |
1466 |
* |
1467 |
* Side effects: |
1468 |
* None. |
1469 |
* |
1470 |
*---------------------------------------------------------------------- |
1471 |
*/ |
1472 |
|
1473 |
int |
1474 |
Tcl_UniCharIsPrint(ch) |
1475 |
int ch; /* Unicode character to test. */ |
1476 |
{ |
1477 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1478 |
return ((PRINT_BITS >> category) & 1); |
1479 |
} |
1480 |
|
1481 |
/* |
1482 |
*---------------------------------------------------------------------- |
1483 |
* |
1484 |
* Tcl_UniCharIsPunct -- |
1485 |
* |
1486 |
* Test if a character is a Unicode punctuation character. |
1487 |
* |
1488 |
* Results: |
1489 |
* Returns non-zero if character is punct. |
1490 |
* |
1491 |
* Side effects: |
1492 |
* None. |
1493 |
* |
1494 |
*---------------------------------------------------------------------- |
1495 |
*/ |
1496 |
|
1497 |
int |
1498 |
Tcl_UniCharIsPunct(ch) |
1499 |
int ch; /* Unicode character to test. */ |
1500 |
{ |
1501 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1502 |
return ((PUNCT_BITS >> category) & 1); |
1503 |
} |
1504 |
|
1505 |
/* |
1506 |
*---------------------------------------------------------------------- |
1507 |
* |
1508 |
* Tcl_UniCharIsSpace -- |
1509 |
* |
1510 |
* Test if a character is a whitespace Unicode character. |
1511 |
* |
1512 |
* Results: |
1513 |
* Returns non-zero if character is a space. |
1514 |
* |
1515 |
* Side effects: |
1516 |
* None. |
1517 |
* |
1518 |
*---------------------------------------------------------------------- |
1519 |
*/ |
1520 |
|
1521 |
int |
1522 |
Tcl_UniCharIsSpace(ch) |
1523 |
int ch; /* Unicode character to test. */ |
1524 |
{ |
1525 |
register int category; |
1526 |
|
1527 |
/* |
1528 |
* If the character is within the first 127 characters, just use the |
1529 |
* standard C function, otherwise consult the Unicode table. |
1530 |
*/ |
1531 |
|
1532 |
if (ch < 0x80) { |
1533 |
return isspace(UCHAR(ch)); /* INTL: ISO space */ |
1534 |
} else { |
1535 |
category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1536 |
return ((SPACE_BITS >> category) & 1); |
1537 |
} |
1538 |
} |
1539 |
|
1540 |
/* |
1541 |
*---------------------------------------------------------------------- |
1542 |
* |
1543 |
* Tcl_UniCharIsUpper -- |
1544 |
* |
1545 |
* Test if a character is a uppercase Unicode character. |
1546 |
* |
1547 |
* Results: |
1548 |
* Returns non-zero if character is uppercase. |
1549 |
* |
1550 |
* Side effects: |
1551 |
* None. |
1552 |
* |
1553 |
*---------------------------------------------------------------------- |
1554 |
*/ |
1555 |
|
1556 |
int |
1557 |
Tcl_UniCharIsUpper(ch) |
1558 |
int ch; /* Unicode character to test. */ |
1559 |
{ |
1560 |
return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); |
1561 |
} |
1562 |
|
1563 |
/* |
1564 |
*---------------------------------------------------------------------- |
1565 |
* |
1566 |
* Tcl_UniCharIsWordChar -- |
1567 |
* |
1568 |
* Test if a character is alphanumeric or a connector punctuation |
1569 |
* mark. |
1570 |
* |
1571 |
* Results: |
1572 |
* Returns 1 if character is a word character. |
1573 |
* |
1574 |
* Side effects: |
1575 |
* None. |
1576 |
* |
1577 |
*---------------------------------------------------------------------- |
1578 |
*/ |
1579 |
|
1580 |
int |
1581 |
Tcl_UniCharIsWordChar(ch) |
1582 |
int ch; /* Unicode character to test. */ |
1583 |
{ |
1584 |
register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); |
1585 |
|
1586 |
return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); |
1587 |
} |
1588 |
|
1589 |
/* End of tclutf.c */ |