/[dtapublic]/projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Contents of /projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 71 - (show annotations) (download)
Sat Nov 5 11:07:06 2016 UTC (8 years ago) by dashley
File MIME type: text/plain
File size: 37902 byte(s)
Set EOL properties appropriately to facilitate simultaneous Linux and Windows development.
1 /* $Header$ */
2 /*
3 * tclUtf.c --
4 *
5 * Routines for manipulating UTF-8 strings.
6 *
7 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
8 *
9 * See the file "license.terms" for information on usage and redistribution
10 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11 *
12 * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
13 */
14
15 #include "tclInt.h"
16
17 /*
18 * Include the static character classification tables and macros.
19 */
20
21 #include "tclUniData.c"
22
23 /*
24 * The following macros are used for fast character category tests. The
25 * x_BITS values are shifted right by the category value to determine whether
26 * the given category is included in the set.
27 */
28
/* Every letter category. */
#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) | \
     (1 << LOWERCASE_LETTER) | \
     (1 << TITLECASE_LETTER) | \
     (1 << MODIFIER_LETTER) | \
     (1 << OTHER_LETTER))

/* Decimal digits only (letter-numbers and other numbers are excluded). */
#define DIGIT_BITS	(1 << DECIMAL_DIGIT_NUMBER)

/* Space, line and paragraph separators. */
#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) | \
     (1 << LINE_SEPARATOR) | \
     (1 << PARAGRAPH_SEPARATOR))

/* Connector punctuation. */
#define CONNECTOR_BITS	(1 << CONNECTOR_PUNCTUATION)

/* Every punctuation category. */
#define PUNCT_BITS \
    ((1 << CONNECTOR_PUNCTUATION) | \
     (1 << DASH_PUNCTUATION) | \
     (1 << OPEN_PUNCTUATION) | \
     (1 << CLOSE_PUNCTUATION) | \
     (1 << INITIAL_QUOTE_PUNCTUATION) | \
     (1 << FINAL_QUOTE_PUNCTUATION) | \
     (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, all punctuation,
 * combining marks, the remaining number categories and all symbols.
 */
#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
     (1 << NON_SPACING_MARK) | \
     (1 << ENCLOSING_MARK) | \
     (1 << COMBINING_SPACING_MARK) | \
     (1 << LETTER_NUMBER) | \
     (1 << OTHER_NUMBER) | \
     (1 << MATH_SYMBOL) | \
     (1 << CURRENCY_SYMBOL) | \
     (1 << MODIFIER_SYMBOL) | \
     (1 << OTHER_SYMBOL))

/*
 * Unicode characters below this value are stored as a single byte equal to
 * the character itself in UTF-8 strings.
 */

#define UNICODE_SELF	0x80
60
61 /*
62 * The following structures are used when mapping between Unicode (UCS-2)
63 * and UTF-8.
64 */
65
66 CONST unsigned char totalBytes[256] = {
67 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
74 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
75 #if TCL_UTF_MAX > 3
76 4,4,4,4,4,4,4,4,
77 #else
78 1,1,1,1,1,1,1,1,
79 #endif
80 #if TCL_UTF_MAX > 4
81 5,5,5,5,
82 #else
83 1,1,1,1,
84 #endif
85 #if TCL_UTF_MAX > 5
86 6,6,6,6
87 #else
88 1,1,1,1
89 #endif
90 };
91
92 /*
93 * Procedures used only in this module.
94 */
95
96 static int UtfCount _ANSI_ARGS_((int ch));
97
98
99 /*
100 *---------------------------------------------------------------------------
101 *
102 * UtfCount --
103 *
104 * Find the number of bytes in the Utf character "ch".
105 *
106 * Results:
107 * The return value is the number of bytes in the Utf character "ch".
108 *
109 * Side effects:
110 * None.
111 *
112 *---------------------------------------------------------------------------
113 */
114
115 static int
116 UtfCount(ch)
117 int ch; /* The Tcl_UniChar whose size is returned. */
118 {
119 if ((ch > 0) && (ch < UNICODE_SELF)) {
120 return 1;
121 }
122 if (ch <= 0x7FF) {
123 return 2;
124 }
125 if (ch <= 0xFFFF) {
126 return 3;
127 }
128 #if TCL_UTF_MAX > 3
129 if (ch <= 0x1FFFFF) {
130 return 4;
131 }
132 if (ch <= 0x3FFFFFF) {
133 return 5;
134 }
135 if (ch <= 0x7FFFFFFF) {
136 return 6;
137 }
138 #endif
139 return 3;
140 }
141
142 /*
143 *---------------------------------------------------------------------------
144 *
145 * Tcl_UniCharToUtf --
146 *
147 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
148 * provided buffer. Equivalent to Plan 9 runetochar().
149 *
150 * Results:
151 * The return value is the number of bytes in the buffer that
152 * were consumed.
153 *
154 * Side effects:
155 * None.
156 *
157 *---------------------------------------------------------------------------
158 */
159
160 INLINE int
161 Tcl_UniCharToUtf(ch, str)
162 int ch; /* The Tcl_UniChar to be stored in the
163 * buffer. */
164 char *str; /* Buffer in which the UTF-8 representation
165 * of the Tcl_UniChar is stored. Buffer must
166 * be large enough to hold the UTF-8 character
167 * (at most TCL_UTF_MAX bytes). */
168 {
169 if ((ch > 0) && (ch < UNICODE_SELF)) {
170 str[0] = (char) ch;
171 return 1;
172 }
173 if (ch <= 0x7FF) {
174 str[1] = (char) ((ch | 0x80) & 0xBF);
175 str[0] = (char) ((ch >> 6) | 0xC0);
176 return 2;
177 }
178 if (ch <= 0xFFFF) {
179 three:
180 str[2] = (char) ((ch | 0x80) & 0xBF);
181 str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
182 str[0] = (char) ((ch >> 12) | 0xE0);
183 return 3;
184 }
185
186 #if TCL_UTF_MAX > 3
187 if (ch <= 0x1FFFFF) {
188 str[3] = (char) ((ch | 0x80) & 0xBF);
189 str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
190 str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
191 str[0] = (char) ((ch >> 18) | 0xF0);
192 return 4;
193 }
194 if (ch <= 0x3FFFFFF) {
195 str[4] = (char) ((ch | 0x80) & 0xBF);
196 str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
197 str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
198 str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
199 str[0] = (char) ((ch >> 24) | 0xF8);
200 return 5;
201 }
202 if (ch <= 0x7FFFFFFF) {
203 str[5] = (char) ((ch | 0x80) & 0xBF);
204 str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
205 str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
206 str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
207 str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
208 str[0] = (char) ((ch >> 30) | 0xFC);
209 return 6;
210 }
211 #endif
212
213 ch = 0xFFFD;
214 goto three;
215 }
216
217 /*
218 *---------------------------------------------------------------------------
219 *
220 * Tcl_UniCharToUtfDString --
221 *
222 * Convert the given Unicode string to UTF-8.
223 *
224 * Results:
225 * The return value is a pointer to the UTF-8 representation of the
226 * Unicode string. Storage for the return value is appended to the
227 * end of dsPtr.
228 *
229 * Side effects:
230 * None.
231 *
232 *---------------------------------------------------------------------------
233 */
234
235 char *
236 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
237 CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
238 int numChars; /* Length of Unicode string in Tcl_UniChars
239 * (must be >= 0). */
240 Tcl_DString *dsPtr; /* UTF-8 representation of string is
241 * appended to this previously initialized
242 * DString. */
243 {
244 CONST Tcl_UniChar *w, *wEnd;
245 char *p, *string;
246 int oldLength;
247
248 /*
249 * UTF-8 string length in bytes will be <= Unicode string length *
250 * TCL_UTF_MAX.
251 */
252
253 oldLength = Tcl_DStringLength(dsPtr);
254 Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
255 string = Tcl_DStringValue(dsPtr) + oldLength;
256
257 p = string;
258 wEnd = wString + numChars;
259 for (w = wString; w < wEnd; ) {
260 p += Tcl_UniCharToUtf(*w, p);
261 w++;
262 }
263 Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
264
265 return string;
266 }
267
268 /*
269 *---------------------------------------------------------------------------
270 *
271 * Tcl_UtfToUniChar --
272 *
273 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
274 * UTF-8 sequences are converted to valid Tcl_UniChars and processing
275 * continues. Equivalent to Plan 9 chartorune().
276 *
277 * The caller must ensure that the source buffer is long enough that
278 * this routine does not run off the end and dereference non-existent
279 * memory looking for trail bytes. If the source buffer is known to
280 * be '\0' terminated, this cannot happen. Otherwise, the caller
281 * should call Tcl_UtfCharComplete() before calling this routine to
282 * ensure that enough bytes remain in the string.
283 *
284 * Results:
285 * *chPtr is filled with the Tcl_UniChar, and the return value is the
286 * number of bytes from the UTF-8 string that were consumed.
287 *
288 * Side effects:
289 * None.
290 *
291 *---------------------------------------------------------------------------
292 */
293
294 int
295 Tcl_UtfToUniChar(str, chPtr)
296 register CONST char *str; /* The UTF-8 string. */
297 register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
298 * by the UTF-8 string. */
299 {
300 register int byte;
301
302 /*
303 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
304 */
305
306 byte = *((unsigned char *) str);
307 if (byte < 0xC0) {
308 /*
309 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
310 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
311 * characters representing themselves.
312 */
313
314 *chPtr = (Tcl_UniChar) byte;
315 return 1;
316 } else if (byte < 0xE0) {
317 if ((str[1] & 0xC0) == 0x80) {
318 /*
319 * Two-byte-character lead-byte followed by a trail-byte.
320 */
321
322 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
323 return 2;
324 }
325 /*
326 * A two-byte-character lead-byte not followed by trail-byte
327 * represents itself.
328 */
329
330 *chPtr = (Tcl_UniChar) byte;
331 return 1;
332 } else if (byte < 0xF0) {
333 if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
334 /*
335 * Three-byte-character lead byte followed by two trail bytes.
336 */
337
338 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339 | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
340 return 3;
341 }
342 /*
343 * A three-byte-character lead-byte not followed by two trail-bytes
344 * represents itself.
345 */
346
347 *chPtr = (Tcl_UniChar) byte;
348 return 1;
349 }
350 #if TCL_UTF_MAX > 3
351 else {
352 int ch, total, trail;
353
354 total = totalBytes[byte];
355 trail = total - 1;
356 if (trail > 0) {
357 ch = byte & (0x3F >> trail);
358 do {
359 str++;
360 if ((*str & 0xC0) != 0x80) {
361 *chPtr = byte;
362 return 1;
363 }
364 ch <<= 6;
365 ch |= (*str & 0x3F);
366 trail--;
367 } while (trail > 0);
368 *chPtr = ch;
369 return total;
370 }
371 }
372 #endif
373
374 *chPtr = (Tcl_UniChar) byte;
375 return 1;
376 }
377
378 /*
379 *---------------------------------------------------------------------------
380 *
381 * Tcl_UtfToUniCharDString --
382 *
383 * Convert the UTF-8 string to Unicode.
384 *
385 * Results:
386 * The return value is a pointer to the Unicode representation of the
387 * UTF-8 string. Storage for the return value is appended to the
388 * end of dsPtr. The Unicode string is terminated with a Unicode
389 * NULL character.
390 *
391 * Side effects:
392 * None.
393 *
394 *---------------------------------------------------------------------------
395 */
396
397 Tcl_UniChar *
398 Tcl_UtfToUniCharDString(string, length, dsPtr)
399 CONST char *string; /* UTF-8 string to convert to Unicode. */
400 int length; /* Length of UTF-8 string in bytes, or -1
401 * for strlen(). */
402 Tcl_DString *dsPtr; /* Unicode representation of string is
403 * appended to this previously initialized
404 * DString. */
405 {
406 Tcl_UniChar *w, *wString;
407 CONST char *p, *end;
408 int oldLength;
409
410 if (length < 0) {
411 length = strlen(string);
412 }
413
414 /*
415 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
416 * in bytes.
417 */
418
419 oldLength = Tcl_DStringLength(dsPtr);
420 Tcl_DStringSetLength(dsPtr,
421 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423
424 w = wString;
425 end = string + length;
426 for (p = string; p < end; ) {
427 p += Tcl_UtfToUniChar(p, w);
428 w++;
429 }
430 *w = '\0';
431 Tcl_DStringSetLength(dsPtr,
432 (oldLength + ((char *) w - (char *) wString)));
433
434 return wString;
435 }
436
437 /*
438 *---------------------------------------------------------------------------
439 *
440 * Tcl_UtfCharComplete --
441 *
442 * Determine if the UTF-8 string of the given length is long enough
443 * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
444 * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
445 *
446 * Results:
447 * The return value is 0 if the string is not long enough, non-zero
448 * otherwise.
449 *
450 * Side effects:
451 * None.
452 *
453 *---------------------------------------------------------------------------
454 */
455
456 int
457 Tcl_UtfCharComplete(str, len)
458 CONST char *str; /* String to check if first few bytes
459 * contain a complete UTF-8 character. */
460 int len; /* Length of above string in bytes. */
461 {
462 int ch;
463
464 ch = *((unsigned char *) str);
465 return len >= totalBytes[ch];
466 }
467
468 /*
469 *---------------------------------------------------------------------------
470 *
471 * Tcl_NumUtfChars --
472 *
473 * Returns the number of characters (not bytes) in the UTF-8 string,
474 * not including the terminating NULL byte. This is equivalent to
475 * Plan 9 utflen() and utfnlen().
476 *
477 * Results:
478 * As above.
479 *
480 * Side effects:
481 * None.
482 *
483 *---------------------------------------------------------------------------
484 */
485
486 int
487 Tcl_NumUtfChars(str, len)
488 register CONST char *str; /* The UTF-8 string to measure. */
489 int len; /* The length of the string in bytes, or -1
490 * for strlen(string). */
491 {
492 Tcl_UniChar ch;
493 register Tcl_UniChar *chPtr = &ch;
494 register int n;
495 int i;
496
497 /*
498 * The separate implementations are faster.
499 */
500
501 i = 0;
502 if (len < 0) {
503 while (1) {
504 str += Tcl_UtfToUniChar(str, chPtr);
505 if (ch == '\0') {
506 break;
507 }
508 i++;
509 }
510 } else {
511 while (len > 0) {
512 n = Tcl_UtfToUniChar(str, chPtr);
513 len -= n;
514 str += n;
515 i++;
516 }
517 }
518 return i;
519 }
520
521 /*
522 *---------------------------------------------------------------------------
523 *
524 * Tcl_UtfFindFirst --
525 *
526 * Returns a pointer to the first occurrence of the given Tcl_UniChar
527 * in the NULL-terminated UTF-8 string. The NULL terminator is
528 * considered part of the UTF-8 string. Equivalent to Plan 9
529 * utfrune().
530 *
531 * Results:
532 * As above. If the Tcl_UniChar does not exist in the given string,
533 * the return value is NULL.
534 *
535 * Side effects:
536 * None.
537 *
538 *---------------------------------------------------------------------------
539 */
540 char *
541 Tcl_UtfFindFirst(string, ch)
542 CONST char *string; /* The UTF-8 string to be searched. */
543 int ch; /* The Tcl_UniChar to search for. */
544 {
545 int len;
546 Tcl_UniChar find;
547
548 while (1) {
549 len = Tcl_UtfToUniChar(string, &find);
550 if (find == ch) {
551 return (char *) string;
552 }
553 if (*string == '\0') {
554 return NULL;
555 }
556 string += len;
557 }
558 }
559
560 /*
561 *---------------------------------------------------------------------------
562 *
563 * Tcl_UtfFindLast --
564 *
565 * Returns a pointer to the last occurrence of the given Tcl_UniChar
566 * in the NULL-terminated UTF-8 string. The NULL terminator is
567 * considered part of the UTF-8 string. Equivalent to Plan 9
568 * utfrrune().
569 *
570 * Results:
571 * As above. If the Tcl_UniChar does not exist in the given string,
572 * the return value is NULL.
573 *
574 * Side effects:
575 * None.
576 *
577 *---------------------------------------------------------------------------
578 */
579
580 char *
581 Tcl_UtfFindLast(string, ch)
582 CONST char *string; /* The UTF-8 string to be searched. */
583 int ch; /* The Tcl_UniChar to search for. */
584 {
585 int len;
586 Tcl_UniChar find;
587 CONST char *last;
588
589 last = NULL;
590 while (1) {
591 len = Tcl_UtfToUniChar(string, &find);
592 if (find == ch) {
593 last = string;
594 }
595 if (*string == '\0') {
596 break;
597 }
598 string += len;
599 }
600 return (char *) last;
601 }
602
603 /*
604 *---------------------------------------------------------------------------
605 *
606 * Tcl_UtfNext --
607 *
608 * Given a pointer to some current location in a UTF-8 string,
609 * move forward one character. The caller must ensure that they
610 * are not asking for the next character after the last character
611 * in the string.
612 *
613 * Results:
614 * The return value is the pointer to the next character in
615 * the UTF-8 string.
616 *
617 * Side effects:
618 * None.
619 *
620 *---------------------------------------------------------------------------
621 */
622
623 char *
624 Tcl_UtfNext(str)
625 CONST char *str; /* The current location in the string. */
626 {
627 Tcl_UniChar ch;
628
629 return (char *) str + Tcl_UtfToUniChar(str, &ch);
630 }
631
632 /*
633 *---------------------------------------------------------------------------
634 *
635 * Tcl_UtfPrev --
636 *
637 * Given a pointer to some current location in a UTF-8 string,
638 * move backwards one character.
639 *
640 * Results:
641 * The return value is a pointer to the previous character in the
642 * UTF-8 string. If the current location was already at the
643 * beginning of the string, the return value will also be a
644 * pointer to the beginning of the string.
645 *
646 * Side effects:
647 * None.
648 *
649 *---------------------------------------------------------------------------
650 */
651
652 char *
653 Tcl_UtfPrev(str, start)
654 CONST char *str; /* The current location in the string. */
655 CONST char *start; /* Pointer to the beginning of the
656 * string, to avoid going backwards too
657 * far. */
658 {
659 CONST char *look;
660 int i, byte;
661
662 str--;
663 look = str;
664 for (i = 0; i < TCL_UTF_MAX; i++) {
665 if (look < start) {
666 if (str < start) {
667 str = start;
668 }
669 break;
670 }
671 byte = *((unsigned char *) look);
672 if (byte < 0x80) {
673 break;
674 }
675 if (byte >= 0xC0) {
676 if (totalBytes[byte] != i + 1) {
677 break;
678 }
679 return (char *) look;
680 }
681 look--;
682 }
683 return (char *) str;
684 }
685
686 /*
687 *---------------------------------------------------------------------------
688 *
689 * Tcl_UniCharAtIndex --
690 *
691 * Returns the Unicode character represented at the specified
692 * character (not byte) position in the UTF-8 string.
693 *
694 * Results:
695 * As above.
696 *
697 * Side effects:
698 * None.
699 *
700 *---------------------------------------------------------------------------
701 */
702
703 Tcl_UniChar
704 Tcl_UniCharAtIndex(src, index)
705 register CONST char *src; /* The UTF-8 string to dereference. */
706 register int index; /* The position of the desired character. */
707 {
708 Tcl_UniChar ch;
709
710 while (index >= 0) {
711 index--;
712 src += Tcl_UtfToUniChar(src, &ch);
713 }
714 return ch;
715 }
716
717 /*
718 *---------------------------------------------------------------------------
719 *
720 * Tcl_UtfAtIndex --
721 *
722 * Returns a pointer to the specified character (not byte) position
723 * in the UTF-8 string.
724 *
725 * Results:
726 * As above.
727 *
728 * Side effects:
729 * None.
730 *
731 *---------------------------------------------------------------------------
732 */
733
734 char *
735 Tcl_UtfAtIndex(src, index)
736 register CONST char *src; /* The UTF-8 string. */
737 register int index; /* The position of the desired character. */
738 {
739 Tcl_UniChar ch;
740
741 while (index > 0) {
742 index--;
743 src += Tcl_UtfToUniChar(src, &ch);
744 }
745 return (char *) src;
746 }
747
748 /*
749 *---------------------------------------------------------------------------
750 *
751 * Tcl_UtfBackslash --
752 *
753 * Figure out how to handle a backslash sequence.
754 *
755 * Results:
756 * Stores the bytes represented by the backslash sequence in dst and
757 * returns the number of bytes written to dst. At most TCL_UTF_MAX
758 * bytes are written to dst; dst must have been large enough to accept
759 * those bytes. If readPtr isn't NULL then it is filled in with a
760 * count of the number of bytes in the backslash sequence.
761 *
762 * Side effects:
763 * The maximum number of bytes it takes to represent a Unicode
764 * character in UTF-8 is guaranteed to be less than the number of
765 * bytes used to express the backslash sequence that represents
766 * that Unicode character. If the target buffer into which the
767 * caller is going to store the bytes that represent the Unicode
768 * character is at least as large as the source buffer from which
769 * the backslashed sequence was extracted, no buffer overruns should
770 * occur.
771 *
772 *---------------------------------------------------------------------------
773 */
774
775 int
776 Tcl_UtfBackslash(src, readPtr, dst)
777 CONST char *src; /* Points to the backslash character of
778 * a backslash sequence. */
779 int *readPtr; /* Fill in with number of characters read
780 * from src, unless NULL. */
781 char *dst; /* Filled with the bytes represented by the
782 * backslash sequence. */
783 {
784 register CONST char *p = src+1;
785 int result, count, n;
786 char buf[TCL_UTF_MAX];
787
788 if (dst == NULL) {
789 dst = buf;
790 }
791
792 count = 2;
793 switch (*p) {
794 /*
795 * Note: in the conversions below, use absolute values (e.g.,
796 * 0xa) rather than symbolic values (e.g. \n) that get converted
797 * by the compiler. It's possible that compilers on some
798 * platforms will do the symbolic conversions differently, which
799 * could result in non-portable Tcl scripts.
800 */
801
802 case 'a':
803 result = 0x7;
804 break;
805 case 'b':
806 result = 0x8;
807 break;
808 case 'f':
809 result = 0xc;
810 break;
811 case 'n':
812 result = 0xa;
813 break;
814 case 'r':
815 result = 0xd;
816 break;
817 case 't':
818 result = 0x9;
819 break;
820 case 'v':
821 result = 0xb;
822 break;
823 case 'x':
824 if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
825 char *end;
826
827 result = (unsigned char) strtoul(p+1, &end, 16);
828 count = end - src;
829 } else {
830 count = 2;
831 result = 'x';
832 }
833 break;
834 case 'u':
835 result = 0;
836 for (count = 0; count < 4; count++) {
837 p++;
838 if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
839 break;
840 }
841 n = *p - '0';
842 if (n > 9) {
843 n = n + '0' + 10 - 'A';
844 }
845 if (n > 16) {
846 n = n + 'A' - 'a';
847 }
848 result = (result << 4) + n;
849 }
850 if (count == 0) {
851 result = 'u';
852 }
853 count += 2;
854 break;
855
856 case '\n':
857 do {
858 p++;
859 } while ((*p == ' ') || (*p == '\t'));
860 result = ' ';
861 count = p - src;
862 break;
863 case 0:
864 result = '\\';
865 count = 1;
866 break;
867 default:
868 /*
869 * Check for an octal number \oo?o?
870 */
871 if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
872 result = (unsigned char)(*p - '0');
873 p++;
874 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
875 break;
876 }
877 count = 3;
878 result = (unsigned char)((result << 3) + (*p - '0'));
879 p++;
880 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
881 break;
882 }
883 count = 4;
884 result = (unsigned char)((result << 3) + (*p - '0'));
885 break;
886 }
887 result = *p;
888 count = 2;
889 break;
890 }
891
892 if (readPtr != NULL) {
893 *readPtr = count;
894 }
895 return Tcl_UniCharToUtf(result, dst);
896 }
897
898 /*
899 *----------------------------------------------------------------------
900 *
901 * Tcl_UtfToUpper --
902 *
903 * Convert lowercase characters to uppercase characters in a UTF
904 * string in place. The conversion may shrink the UTF string.
905 *
906 * Results:
907 * Returns the number of bytes in the resulting string
908 * excluding the trailing null.
909 *
910 * Side effects:
911 * Writes a terminating null after the last converted character.
912 *
913 *----------------------------------------------------------------------
914 */
915
916 int
917 Tcl_UtfToUpper(str)
918 char *str; /* String to convert in place. */
919 {
920 Tcl_UniChar ch, upChar;
921 char *src, *dst;
922 int bytes;
923
924 /*
925 * Iterate over the string until we hit the terminating null.
926 */
927
928 src = dst = str;
929 while (*src) {
930 bytes = Tcl_UtfToUniChar(src, &ch);
931 upChar = Tcl_UniCharToUpper(ch);
932
933 /*
934 * To keep badly formed Utf strings from getting inflated by
935 * the conversion (thereby causing a segfault), only copy the
936 * upper case char to dst if its size is <= the original char.
937 */
938
939 if (bytes < UtfCount(upChar)) {
940 memcpy(dst, src, (size_t) bytes);
941 dst += bytes;
942 } else {
943 dst += Tcl_UniCharToUtf(upChar, dst);
944 }
945 src += bytes;
946 }
947 *dst = '\0';
948 return (dst - str);
949 }
950
951 /*
952 *----------------------------------------------------------------------
953 *
954 * Tcl_UtfToLower --
955 *
956 * Convert uppercase characters to lowercase characters in a UTF
957 * string in place. The conversion may shrink the UTF string.
958 *
959 * Results:
960 * Returns the number of bytes in the resulting string
961 * excluding the trailing null.
962 *
963 * Side effects:
964 * Writes a terminating null after the last converted character.
965 *
966 *----------------------------------------------------------------------
967 */
968
969 int
970 Tcl_UtfToLower(str)
971 char *str; /* String to convert in place. */
972 {
973 Tcl_UniChar ch, lowChar;
974 char *src, *dst;
975 int bytes;
976
977 /*
978 * Iterate over the string until we hit the terminating null.
979 */
980
981 src = dst = str;
982 while (*src) {
983 bytes = Tcl_UtfToUniChar(src, &ch);
984 lowChar = Tcl_UniCharToLower(ch);
985
986 /*
987 * To keep badly formed Utf strings from getting inflated by
988 * the conversion (thereby causing a segfault), only copy the
989 * lower case char to dst if its size is <= the original char.
990 */
991
992 if (bytes < UtfCount(lowChar)) {
993 memcpy(dst, src, (size_t) bytes);
994 dst += bytes;
995 } else {
996 dst += Tcl_UniCharToUtf(lowChar, dst);
997 }
998 src += bytes;
999 }
1000 *dst = '\0';
1001 return (dst - str);
1002 }
1003
1004 /*
1005 *----------------------------------------------------------------------
1006 *
1007 * Tcl_UtfToTitle --
1008 *
1009 * Changes the first character of a UTF string to title case or
1010 * uppercase and the rest of the string to lowercase. The
1011 * conversion happens in place and may shrink the UTF string.
1012 *
1013 * Results:
1014 * Returns the number of bytes in the resulting string
1015 * excluding the trailing null.
1016 *
1017 * Side effects:
1018 * Writes a terminating null after the last converted character.
1019 *
1020 *----------------------------------------------------------------------
1021 */
1022
1023 int
1024 Tcl_UtfToTitle(str)
1025 char *str; /* String to convert in place. */
1026 {
1027 Tcl_UniChar ch, titleChar, lowChar;
1028 char *src, *dst;
1029 int bytes;
1030
1031 /*
1032 * Capitalize the first character and then lowercase the rest of the
1033 * characters until we get to a null.
1034 */
1035
1036 src = dst = str;
1037
1038 if (*src) {
1039 bytes = Tcl_UtfToUniChar(src, &ch);
1040 titleChar = Tcl_UniCharToTitle(ch);
1041
1042 if (bytes < UtfCount(titleChar)) {
1043 memcpy(dst, src, (size_t) bytes);
1044 dst += bytes;
1045 } else {
1046 dst += Tcl_UniCharToUtf(titleChar, dst);
1047 }
1048 src += bytes;
1049 }
1050 while (*src) {
1051 bytes = Tcl_UtfToUniChar(src, &ch);
1052 lowChar = Tcl_UniCharToLower(ch);
1053
1054 if (bytes < UtfCount(lowChar)) {
1055 memcpy(dst, src, (size_t) bytes);
1056 dst += bytes;
1057 } else {
1058 dst += Tcl_UniCharToUtf(lowChar, dst);
1059 }
1060 src += bytes;
1061 }
1062 *dst = '\0';
1063 return (dst - str);
1064 }
1065
1066 /*
1067 *----------------------------------------------------------------------
1068 *
1069 * Tcl_UtfNcmp --
1070 *
1071 * Compare at most n UTF chars of string cs to string ct. Both cs
1072 * and ct are assumed to be at least n UTF chars long.
1073 *
1074 * Results:
1075 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1076 *
1077 * Side effects:
1078 * None.
1079 *
1080 *----------------------------------------------------------------------
1081 */
1082
1083 int
1084 Tcl_UtfNcmp(cs, ct, n)
1085 CONST char *cs; /* UTF string to compare to ct. */
1086 CONST char *ct; /* UTF string cs is compared to. */
1087 unsigned long n; /* Number of UTF chars to compare. */
1088 {
1089 Tcl_UniChar ch1, ch2;
1090 /*
1091 * Another approach that should work is:
1092 * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1093 * That assumes that ct is a properly formed UTF, so we will just
1094 * be comparing the bytes that compromise those strings to the
1095 * char length n.
1096 */
1097 while (n-- > 0) {
1098 /*
1099 * n must be interpreted as chars, not bytes.
1100 * This should be called only when both strings are of
1101 * at least n chars long (no need for \0 check)
1102 */
1103 cs += Tcl_UtfToUniChar(cs, &ch1);
1104 ct += Tcl_UtfToUniChar(ct, &ch2);
1105 if (ch1 != ch2) {
1106 return (ch1 - ch2);
1107 }
1108 }
1109 return 0;
1110 }
1111
1112 /*
1113 *----------------------------------------------------------------------
1114 *
1115 * Tcl_UtfNcasecmp --
1116 *
1117 * Compare at most n UTF chars of string cs to string ct case
1118 * insensitive. Both cs and ct are assumed to be at least n
1119 * UTF chars long.
1120 *
1121 * Results:
1122 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1123 *
1124 * Side effects:
1125 * None.
1126 *
1127 *----------------------------------------------------------------------
1128 */
1129
1130 int
1131 Tcl_UtfNcasecmp(cs, ct, n)
1132 CONST char *cs; /* UTF string to compare to ct. */
1133 CONST char *ct; /* UTF string cs is compared to. */
1134 unsigned long n; /* Number of UTF chars to compare. */
1135 {
1136 Tcl_UniChar ch1, ch2;
1137 while (n-- > 0) {
1138 /*
1139 * n must be interpreted as chars, not bytes.
1140 * This should be called only when both strings are of
1141 * at least n chars long (no need for \0 check)
1142 */
1143 cs += Tcl_UtfToUniChar(cs, &ch1);
1144 ct += Tcl_UtfToUniChar(ct, &ch2);
1145 if (ch1 != ch2) {
1146 ch1 = Tcl_UniCharToLower(ch1);
1147 ch2 = Tcl_UniCharToLower(ch2);
1148 if (ch1 != ch2) {
1149 return (ch1 - ch2);
1150 }
1151 }
1152 }
1153 return 0;
1154 }
1155
1156 /*
1157 *----------------------------------------------------------------------
1158 *
1159 * Tcl_UniCharToUpper --
1160 *
1161 * Compute the uppercase equivalent of the given Unicode character.
1162 *
1163 * Results:
1164 * Returns the uppercase Unicode character.
1165 *
1166 * Side effects:
1167 * None.
1168 *
1169 *----------------------------------------------------------------------
1170 */
1171
1172 Tcl_UniChar
1173 Tcl_UniCharToUpper(ch)
1174 int ch; /* Unicode character to convert. */
1175 {
1176 int info = GetUniCharInfo(ch);
1177
1178 if (GetCaseType(info) & 0x04) {
1179 return (Tcl_UniChar) (ch - GetDelta(info));
1180 } else {
1181 return ch;
1182 }
1183 }
1184
1185 /*
1186 *----------------------------------------------------------------------
1187 *
1188 * Tcl_UniCharToLower --
1189 *
1190 * Compute the lowercase equivalent of the given Unicode character.
1191 *
1192 * Results:
1193 * Returns the lowercase Unicode character.
1194 *
1195 * Side effects:
1196 * None.
1197 *
1198 *----------------------------------------------------------------------
1199 */
1200
1201 Tcl_UniChar
1202 Tcl_UniCharToLower(ch)
1203 int ch; /* Unicode character to convert. */
1204 {
1205 int info = GetUniCharInfo(ch);
1206
1207 if (GetCaseType(info) & 0x02) {
1208 return (Tcl_UniChar) (ch + GetDelta(info));
1209 } else {
1210 return ch;
1211 }
1212 }
1213
1214 /*
1215 *----------------------------------------------------------------------
1216 *
1217 * Tcl_UniCharToTitle --
1218 *
1219 * Compute the titlecase equivalent of the given Unicode character.
1220 *
1221 * Results:
1222 * Returns the titlecase Unicode character.
1223 *
1224 * Side effects:
1225 * None.
1226 *
1227 *----------------------------------------------------------------------
1228 */
1229
1230 Tcl_UniChar
1231 Tcl_UniCharToTitle(ch)
1232 int ch; /* Unicode character to convert. */
1233 {
1234 int info = GetUniCharInfo(ch);
1235 int mode = GetCaseType(info);
1236
1237 if (mode & 0x1) {
1238 /*
1239 * Subtract or add one depending on the original case.
1240 */
1241
1242 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1243 } else if (mode == 0x4) {
1244 return (Tcl_UniChar) (ch - GetDelta(info));
1245 } else {
1246 return ch;
1247 }
1248 }
1249
1250 /*
1251 *----------------------------------------------------------------------
1252 *
1253 * Tcl_UniCharLen --
1254 *
1255 * Find the length of a UniChar string. The str input must be null
1256 * terminated.
1257 *
1258 * Results:
1259 * Returns the length of str in UniChars (not bytes).
1260 *
1261 * Side effects:
1262 * None.
1263 *
1264 *----------------------------------------------------------------------
1265 */
1266
1267 int
1268 Tcl_UniCharLen(str)
1269 Tcl_UniChar *str; /* Unicode string to find length of. */
1270 {
1271 int len = 0;
1272
1273 while (*str != '\0') {
1274 len++;
1275 str++;
1276 }
1277 return len;
1278 }
1279
1280 /*
1281 *----------------------------------------------------------------------
1282 *
1283 * Tcl_UniCharNcmp --
1284 *
1285 * Compare at most n unichars of string cs to string ct. Both cs
1286 * and ct are assumed to be at least n unichars long.
1287 *
1288 * Results:
1289 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1290 *
1291 * Side effects:
1292 * None.
1293 *
1294 *----------------------------------------------------------------------
1295 */
1296
1297 int
1298 Tcl_UniCharNcmp(cs, ct, n)
1299 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1300 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1301 unsigned long n; /* Number of unichars to compare. */
1302 {
1303 for ( ; n != 0; n--, cs++, ct++) {
1304 if (*cs != *ct) {
1305 return *cs - *ct;
1306 }
1307 if (*cs == '\0') {
1308 break;
1309 }
1310 }
1311 return 0;
1312 }
1313
1314 /*
1315 *----------------------------------------------------------------------
1316 *
1317 * Tcl_UniCharIsAlnum --
1318 *
1319 * Test if a character is an alphanumeric Unicode character.
1320 *
1321 * Results:
1322 * Returns 1 if character is alphanumeric.
1323 *
1324 * Side effects:
1325 * None.
1326 *
1327 *----------------------------------------------------------------------
1328 */
1329
1330 int
1331 Tcl_UniCharIsAlnum(ch)
1332 int ch; /* Unicode character to test. */
1333 {
1334 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1335
1336 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1337 }
1338
1339 /*
1340 *----------------------------------------------------------------------
1341 *
1342 * Tcl_UniCharIsAlpha --
1343 *
1344 * Test if a character is an alphabetic Unicode character.
1345 *
1346 * Results:
1347 * Returns 1 if character is alphabetic.
1348 *
1349 * Side effects:
1350 * None.
1351 *
1352 *----------------------------------------------------------------------
1353 */
1354
1355 int
1356 Tcl_UniCharIsAlpha(ch)
1357 int ch; /* Unicode character to test. */
1358 {
1359 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1360 return ((ALPHA_BITS >> category) & 1);
1361 }
1362
1363 /*
1364 *----------------------------------------------------------------------
1365 *
1366 * Tcl_UniCharIsControl --
1367 *
1368 * Test if a character is a Unicode control character.
1369 *
1370 * Results:
1371 * Returns non-zero if character is a control.
1372 *
1373 * Side effects:
1374 * None.
1375 *
1376 *----------------------------------------------------------------------
1377 */
1378
1379 int
1380 Tcl_UniCharIsControl(ch)
1381 int ch; /* Unicode character to test. */
1382 {
1383 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1384 }
1385
1386 /*
1387 *----------------------------------------------------------------------
1388 *
1389 * Tcl_UniCharIsDigit --
1390 *
1391 * Test if a character is a numeric Unicode character.
1392 *
1393 * Results:
1394 * Returns non-zero if character is a digit.
1395 *
1396 * Side effects:
1397 * None.
1398 *
1399 *----------------------------------------------------------------------
1400 */
1401
1402 int
1403 Tcl_UniCharIsDigit(ch)
1404 int ch; /* Unicode character to test. */
1405 {
1406 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1407 == DECIMAL_DIGIT_NUMBER);
1408 }
1409
1410 /*
1411 *----------------------------------------------------------------------
1412 *
1413 * Tcl_UniCharIsGraph --
1414 *
1415 * Test if a character is any Unicode print character except space.
1416 *
1417 * Results:
1418 * Returns non-zero if character is printable, but not space.
1419 *
1420 * Side effects:
1421 * None.
1422 *
1423 *----------------------------------------------------------------------
1424 */
1425
1426 int
1427 Tcl_UniCharIsGraph(ch)
1428 int ch; /* Unicode character to test. */
1429 {
1430 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1431 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1432 }
1433
1434 /*
1435 *----------------------------------------------------------------------
1436 *
1437 * Tcl_UniCharIsLower --
1438 *
1439 * Test if a character is a lowercase Unicode character.
1440 *
1441 * Results:
1442 * Returns non-zero if character is lowercase.
1443 *
1444 * Side effects:
1445 * None.
1446 *
1447 *----------------------------------------------------------------------
1448 */
1449
1450 int
1451 Tcl_UniCharIsLower(ch)
1452 int ch; /* Unicode character to test. */
1453 {
1454 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1455 }
1456
1457 /*
1458 *----------------------------------------------------------------------
1459 *
1460 * Tcl_UniCharIsPrint --
1461 *
1462 * Test if a character is a Unicode print character.
1463 *
1464 * Results:
1465 * Returns non-zero if character is printable.
1466 *
1467 * Side effects:
1468 * None.
1469 *
1470 *----------------------------------------------------------------------
1471 */
1472
1473 int
1474 Tcl_UniCharIsPrint(ch)
1475 int ch; /* Unicode character to test. */
1476 {
1477 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1478 return ((PRINT_BITS >> category) & 1);
1479 }
1480
1481 /*
1482 *----------------------------------------------------------------------
1483 *
1484 * Tcl_UniCharIsPunct --
1485 *
1486 * Test if a character is a Unicode punctuation character.
1487 *
1488 * Results:
1489 * Returns non-zero if character is punct.
1490 *
1491 * Side effects:
1492 * None.
1493 *
1494 *----------------------------------------------------------------------
1495 */
1496
1497 int
1498 Tcl_UniCharIsPunct(ch)
1499 int ch; /* Unicode character to test. */
1500 {
1501 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1502 return ((PUNCT_BITS >> category) & 1);
1503 }
1504
1505 /*
1506 *----------------------------------------------------------------------
1507 *
1508 * Tcl_UniCharIsSpace --
1509 *
1510 * Test if a character is a whitespace Unicode character.
1511 *
1512 * Results:
1513 * Returns non-zero if character is a space.
1514 *
1515 * Side effects:
1516 * None.
1517 *
1518 *----------------------------------------------------------------------
1519 */
1520
1521 int
1522 Tcl_UniCharIsSpace(ch)
1523 int ch; /* Unicode character to test. */
1524 {
1525 register int category;
1526
1527 /*
1528 * If the character is within the first 127 characters, just use the
1529 * standard C function, otherwise consult the Unicode table.
1530 */
1531
1532 if (ch < 0x80) {
1533 return isspace(UCHAR(ch)); /* INTL: ISO space */
1534 } else {
1535 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1536 return ((SPACE_BITS >> category) & 1);
1537 }
1538 }
1539
1540 /*
1541 *----------------------------------------------------------------------
1542 *
1543 * Tcl_UniCharIsUpper --
1544 *
1545 * Test if a character is a uppercase Unicode character.
1546 *
1547 * Results:
1548 * Returns non-zero if character is uppercase.
1549 *
1550 * Side effects:
1551 * None.
1552 *
1553 *----------------------------------------------------------------------
1554 */
1555
1556 int
1557 Tcl_UniCharIsUpper(ch)
1558 int ch; /* Unicode character to test. */
1559 {
1560 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1561 }
1562
1563 /*
1564 *----------------------------------------------------------------------
1565 *
1566 * Tcl_UniCharIsWordChar --
1567 *
1568 * Test if a character is alphanumeric or a connector punctuation
1569 * mark.
1570 *
1571 * Results:
1572 * Returns 1 if character is a word character.
1573 *
1574 * Side effects:
1575 * None.
1576 *
1577 *----------------------------------------------------------------------
1578 */
1579
1580 int
1581 Tcl_UniCharIsWordChar(ch)
1582 int ch; /* Unicode character to test. */
1583 {
1584 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1585
1586 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1587 }
1588
1589 /* End of tclutf.c */

Properties

Name Value
svn:eol-style native
svn:keywords Header

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25