/[dtapublic]/projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Contents of /projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 64 - (show annotations) (download)
Sun Oct 30 04:21:11 2016 UTC (7 years, 11 months ago) by dashley
File MIME type: text/plain
File size: 39715 byte(s)
Adjust line endings to Windows style.
Set properties to expand the "Header" keyword.
Change header and footer.
1 /* $Header$ */
2
3 /*
4 * tclUtf.c --
5 *
6 * Routines for manipulating UTF-8 strings.
7 *
8 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
9 *
10 * See the file "license.terms" for information on usage and redistribution
11 * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
12 *
13 * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
14 */
15
16 #include "tclInt.h"
17
18 /*
19 * Include the static character classification tables and macros.
20 */
21
22 #include "tclUniData.c"
23
/*
 * Category bit masks used for fast character category tests.  Every x_BITS
 * value has one bit set, at the position given by the category value, for
 * each character category that belongs to the set; shifting a mask right by
 * a category value therefore tells whether that category is included.
 */

#define DIGIT_BITS \
    (1 << DECIMAL_DIGIT_NUMBER)

#define CONNECTOR_BITS \
    (1 << CONNECTOR_PUNCTUATION)

#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) \
    | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) \
    | (1 << MODIFIER_LETTER) \
    | (1 << OTHER_LETTER))

#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) \
    | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

#define PUNCT_BITS \
    ((1 << CONNECTOR_PUNCTUATION) \
    | (1 << DASH_PUNCTUATION) \
    | (1 << OPEN_PUNCTUATION) \
    | (1 << CLOSE_PUNCTUATION) \
    | (1 << INITIAL_QUOTE_PUNCTUATION) \
    | (1 << FINAL_QUOTE_PUNCTUATION) \
    | (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, all punctuation,
 * marks, the remaining number categories and all symbol categories.
 */

#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS \
    | (1 << NON_SPACING_MARK) \
    | (1 << ENCLOSING_MARK) \
    | (1 << COMBINING_SPACING_MARK) \
    | (1 << LETTER_NUMBER) \
    | (1 << OTHER_NUMBER) \
    | (1 << MATH_SYMBOL) \
    | (1 << CURRENCY_SYMBOL) \
    | (1 << MODIFIER_SYMBOL) \
    | (1 << OTHER_SYMBOL))

/*
 * Unicode characters whose value is below this limit are stored as a single
 * byte, equal to the character itself, in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
61
62 /*
63 * The following structures are used when mapping between Unicode (UCS-2)
64 * and UTF-8.
65 */
66
67 CONST unsigned char totalBytes[256] = {
68 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
74 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
75 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
76 #if TCL_UTF_MAX > 3
77 4,4,4,4,4,4,4,4,
78 #else
79 1,1,1,1,1,1,1,1,
80 #endif
81 #if TCL_UTF_MAX > 4
82 5,5,5,5,
83 #else
84 1,1,1,1,
85 #endif
86 #if TCL_UTF_MAX > 5
87 6,6,6,6
88 #else
89 1,1,1,1
90 #endif
91 };
92
93 /*
94 * Procedures used only in this module.
95 */
96
97 static int UtfCount _ANSI_ARGS_((int ch));
98
99
100 /*
101 *---------------------------------------------------------------------------
102 *
103 * UtfCount --
104 *
105 * Find the number of bytes in the Utf character "ch".
106 *
107 * Results:
108 * The return values is the number of bytes in the Utf character "ch".
109 *
110 * Side effects:
111 * None.
112 *
113 *---------------------------------------------------------------------------
114 */
115
116 static int
117 UtfCount(ch)
118 int ch; /* The Tcl_UniChar whose size is returned. */
119 {
120 if ((ch > 0) && (ch < UNICODE_SELF)) {
121 return 1;
122 }
123 if (ch <= 0x7FF) {
124 return 2;
125 }
126 if (ch <= 0xFFFF) {
127 return 3;
128 }
129 #if TCL_UTF_MAX > 3
130 if (ch <= 0x1FFFFF) {
131 return 4;
132 }
133 if (ch <= 0x3FFFFFF) {
134 return 5;
135 }
136 if (ch <= 0x7FFFFFFF) {
137 return 6;
138 }
139 #endif
140 return 3;
141 }
142
143 /*
144 *---------------------------------------------------------------------------
145 *
146 * Tcl_UniCharToUtf --
147 *
148 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
149 * provided buffer. Equivalent to Plan 9 runetochar().
150 *
151 * Results:
152 * The return values is the number of bytes in the buffer that
153 * were consumed.
154 *
155 * Side effects:
156 * None.
157 *
158 *---------------------------------------------------------------------------
159 */
160
161 INLINE int
162 Tcl_UniCharToUtf(ch, str)
163 int ch; /* The Tcl_UniChar to be stored in the
164 * buffer. */
165 char *str; /* Buffer in which the UTF-8 representation
166 * of the Tcl_UniChar is stored. Buffer must
167 * be large enough to hold the UTF-8 character
168 * (at most TCL_UTF_MAX bytes). */
169 {
170 if ((ch > 0) && (ch < UNICODE_SELF)) {
171 str[0] = (char) ch;
172 return 1;
173 }
174 if (ch <= 0x7FF) {
175 str[1] = (char) ((ch | 0x80) & 0xBF);
176 str[0] = (char) ((ch >> 6) | 0xC0);
177 return 2;
178 }
179 if (ch <= 0xFFFF) {
180 three:
181 str[2] = (char) ((ch | 0x80) & 0xBF);
182 str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
183 str[0] = (char) ((ch >> 12) | 0xE0);
184 return 3;
185 }
186
187 #if TCL_UTF_MAX > 3
188 if (ch <= 0x1FFFFF) {
189 str[3] = (char) ((ch | 0x80) & 0xBF);
190 str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
191 str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
192 str[0] = (char) ((ch >> 18) | 0xF0);
193 return 4;
194 }
195 if (ch <= 0x3FFFFFF) {
196 str[4] = (char) ((ch | 0x80) & 0xBF);
197 str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
198 str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
199 str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
200 str[0] = (char) ((ch >> 24) | 0xF8);
201 return 5;
202 }
203 if (ch <= 0x7FFFFFFF) {
204 str[5] = (char) ((ch | 0x80) & 0xBF);
205 str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
206 str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
207 str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
208 str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
209 str[0] = (char) ((ch >> 30) | 0xFC);
210 return 6;
211 }
212 #endif
213
214 ch = 0xFFFD;
215 goto three;
216 }
217
218 /*
219 *---------------------------------------------------------------------------
220 *
221 * Tcl_UniCharToUtfDString --
222 *
223 * Convert the given Unicode string to UTF-8.
224 *
225 * Results:
226 * The return value is a pointer to the UTF-8 representation of the
227 * Unicode string. Storage for the return value is appended to the
228 * end of dsPtr.
229 *
230 * Side effects:
231 * None.
232 *
233 *---------------------------------------------------------------------------
234 */
235
236 char *
237 Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
238 CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
239 int numChars; /* Length of Unicode string in Tcl_UniChars
240 * (must be >= 0). */
241 Tcl_DString *dsPtr; /* UTF-8 representation of string is
242 * appended to this previously initialized
243 * DString. */
244 {
245 CONST Tcl_UniChar *w, *wEnd;
246 char *p, *string;
247 int oldLength;
248
249 /*
250 * UTF-8 string length in bytes will be <= Unicode string length *
251 * TCL_UTF_MAX.
252 */
253
254 oldLength = Tcl_DStringLength(dsPtr);
255 Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
256 string = Tcl_DStringValue(dsPtr) + oldLength;
257
258 p = string;
259 wEnd = wString + numChars;
260 for (w = wString; w < wEnd; ) {
261 p += Tcl_UniCharToUtf(*w, p);
262 w++;
263 }
264 Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
265
266 return string;
267 }
268
269 /*
270 *---------------------------------------------------------------------------
271 *
272 * Tcl_UtfToUniChar --
273 *
274 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
275 * UTF-8 sequences are converted to valid Tcl_UniChars and processing
276 * continues. Equivalent to Plan 9 chartorune().
277 *
278 * The caller must ensure that the source buffer is long enough that
279 * this routine does not run off the end and dereference non-existent
280 * memory looking for trail bytes. If the source buffer is known to
281 * be '\0' terminated, this cannot happen. Otherwise, the caller
282 * should call Tcl_UtfCharComplete() before calling this routine to
283 * ensure that enough bytes remain in the string.
284 *
285 * Results:
286 * *chPtr is filled with the Tcl_UniChar, and the return value is the
287 * number of bytes from the UTF-8 string that were consumed.
288 *
289 * Side effects:
290 * None.
291 *
292 *---------------------------------------------------------------------------
293 */
294
295 int
296 Tcl_UtfToUniChar(str, chPtr)
297 register CONST char *str; /* The UTF-8 string. */
298 register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
299 * by the UTF-8 string. */
300 {
301 register int byte;
302
303 /*
304 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
305 */
306
307 byte = *((unsigned char *) str);
308 if (byte < 0xC0) {
309 /*
310 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
311 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
312 * characters representing themselves.
313 */
314
315 *chPtr = (Tcl_UniChar) byte;
316 return 1;
317 } else if (byte < 0xE0) {
318 if ((str[1] & 0xC0) == 0x80) {
319 /*
320 * Two-byte-character lead-byte followed by a trail-byte.
321 */
322
323 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
324 return 2;
325 }
326 /*
327 * A two-byte-character lead-byte not followed by trail-byte
328 * represents itself.
329 */
330
331 *chPtr = (Tcl_UniChar) byte;
332 return 1;
333 } else if (byte < 0xF0) {
334 if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
335 /*
336 * Three-byte-character lead byte followed by two trail bytes.
337 */
338
339 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
340 | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
341 return 3;
342 }
343 /*
344 * A three-byte-character lead-byte not followed by two trail-bytes
345 * represents itself.
346 */
347
348 *chPtr = (Tcl_UniChar) byte;
349 return 1;
350 }
351 #if TCL_UTF_MAX > 3
352 else {
353 int ch, total, trail;
354
355 total = totalBytes[byte];
356 trail = total - 1;
357 if (trail > 0) {
358 ch = byte & (0x3F >> trail);
359 do {
360 str++;
361 if ((*str & 0xC0) != 0x80) {
362 *chPtr = byte;
363 return 1;
364 }
365 ch <<= 6;
366 ch |= (*str & 0x3F);
367 trail--;
368 } while (trail > 0);
369 *chPtr = ch;
370 return total;
371 }
372 }
373 #endif
374
375 *chPtr = (Tcl_UniChar) byte;
376 return 1;
377 }
378
379 /*
380 *---------------------------------------------------------------------------
381 *
382 * Tcl_UtfToUniCharDString --
383 *
384 * Convert the UTF-8 string to Unicode.
385 *
386 * Results:
387 * The return value is a pointer to the Unicode representation of the
388 * UTF-8 string. Storage for the return value is appended to the
389 * end of dsPtr. The Unicode string is terminated with a Unicode
390 * NULL character.
391 *
392 * Side effects:
393 * None.
394 *
395 *---------------------------------------------------------------------------
396 */
397
398 Tcl_UniChar *
399 Tcl_UtfToUniCharDString(string, length, dsPtr)
400 CONST char *string; /* UTF-8 string to convert to Unicode. */
401 int length; /* Length of UTF-8 string in bytes, or -1
402 * for strlen(). */
403 Tcl_DString *dsPtr; /* Unicode representation of string is
404 * appended to this previously initialized
405 * DString. */
406 {
407 Tcl_UniChar *w, *wString;
408 CONST char *p, *end;
409 int oldLength;
410
411 if (length < 0) {
412 length = strlen(string);
413 }
414
415 /*
416 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
417 * in bytes.
418 */
419
420 oldLength = Tcl_DStringLength(dsPtr);
421 Tcl_DStringSetLength(dsPtr,
422 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
423 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
424
425 w = wString;
426 end = string + length;
427 for (p = string; p < end; ) {
428 p += Tcl_UtfToUniChar(p, w);
429 w++;
430 }
431 *w = '\0';
432 Tcl_DStringSetLength(dsPtr,
433 (oldLength + ((char *) w - (char *) wString)));
434
435 return wString;
436 }
437
438 /*
439 *---------------------------------------------------------------------------
440 *
441 * Tcl_UtfCharComplete --
442 *
443 * Determine if the UTF-8 string of the given length is long enough
444 * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
445 * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
446 *
447 * Results:
448 * The return value is 0 if the string is not long enough, non-zero
449 * otherwise.
450 *
451 * Side effects:
452 * None.
453 *
454 *---------------------------------------------------------------------------
455 */
456
457 int
458 Tcl_UtfCharComplete(str, len)
459 CONST char *str; /* String to check if first few bytes
460 * contain a complete UTF-8 character. */
461 int len; /* Length of above string in bytes. */
462 {
463 int ch;
464
465 ch = *((unsigned char *) str);
466 return len >= totalBytes[ch];
467 }
468
469 /*
470 *---------------------------------------------------------------------------
471 *
472 * Tcl_NumUtfChars --
473 *
474 * Returns the number of characters (not bytes) in the UTF-8 string,
475 * not including the terminating NULL byte. This is equivalent to
476 * Plan 9 utflen() and utfnlen().
477 *
478 * Results:
479 * As above.
480 *
481 * Side effects:
482 * None.
483 *
484 *---------------------------------------------------------------------------
485 */
486
487 int
488 Tcl_NumUtfChars(str, len)
489 register CONST char *str; /* The UTF-8 string to measure. */
490 int len; /* The length of the string in bytes, or -1
491 * for strlen(string). */
492 {
493 Tcl_UniChar ch;
494 register Tcl_UniChar *chPtr = &ch;
495 register int n;
496 int i;
497
498 /*
499 * The separate implementations are faster.
500 */
501
502 i = 0;
503 if (len < 0) {
504 while (1) {
505 str += Tcl_UtfToUniChar(str, chPtr);
506 if (ch == '\0') {
507 break;
508 }
509 i++;
510 }
511 } else {
512 while (len > 0) {
513 n = Tcl_UtfToUniChar(str, chPtr);
514 len -= n;
515 str += n;
516 i++;
517 }
518 }
519 return i;
520 }
521
522 /*
523 *---------------------------------------------------------------------------
524 *
525 * Tcl_UtfFindFirst --
526 *
527 * Returns a pointer to the first occurance of the given Tcl_UniChar
528 * in the NULL-terminated UTF-8 string. The NULL terminator is
529 * considered part of the UTF-8 string. Equivalent to Plan 9
530 * utfrune().
531 *
532 * Results:
533 * As above. If the Tcl_UniChar does not exist in the given string,
534 * the return value is NULL.
535 *
536 * Side effects:
537 * None.
538 *
539 *---------------------------------------------------------------------------
540 */
541 char *
542 Tcl_UtfFindFirst(string, ch)
543 CONST char *string; /* The UTF-8 string to be searched. */
544 int ch; /* The Tcl_UniChar to search for. */
545 {
546 int len;
547 Tcl_UniChar find;
548
549 while (1) {
550 len = Tcl_UtfToUniChar(string, &find);
551 if (find == ch) {
552 return (char *) string;
553 }
554 if (*string == '\0') {
555 return NULL;
556 }
557 string += len;
558 }
559 }
560
561 /*
562 *---------------------------------------------------------------------------
563 *
564 * Tcl_UtfFindLast --
565 *
566 * Returns a pointer to the last occurance of the given Tcl_UniChar
567 * in the NULL-terminated UTF-8 string. The NULL terminator is
568 * considered part of the UTF-8 string. Equivalent to Plan 9
569 * utfrrune().
570 *
571 * Results:
572 * As above. If the Tcl_UniChar does not exist in the given string,
573 * the return value is NULL.
574 *
575 * Side effects:
576 * None.
577 *
578 *---------------------------------------------------------------------------
579 */
580
581 char *
582 Tcl_UtfFindLast(string, ch)
583 CONST char *string; /* The UTF-8 string to be searched. */
584 int ch; /* The Tcl_UniChar to search for. */
585 {
586 int len;
587 Tcl_UniChar find;
588 CONST char *last;
589
590 last = NULL;
591 while (1) {
592 len = Tcl_UtfToUniChar(string, &find);
593 if (find == ch) {
594 last = string;
595 }
596 if (*string == '\0') {
597 break;
598 }
599 string += len;
600 }
601 return (char *) last;
602 }
603
604 /*
605 *---------------------------------------------------------------------------
606 *
607 * Tcl_UtfNext --
608 *
609 * Given a pointer to some current location in a UTF-8 string,
610 * move forward one character. The caller must ensure that they
611 * are not asking for the next character after the last character
612 * in the string.
613 *
614 * Results:
615 * The return value is the pointer to the next character in
616 * the UTF-8 string.
617 *
618 * Side effects:
619 * None.
620 *
621 *---------------------------------------------------------------------------
622 */
623
624 char *
625 Tcl_UtfNext(str)
626 CONST char *str; /* The current location in the string. */
627 {
628 Tcl_UniChar ch;
629
630 return (char *) str + Tcl_UtfToUniChar(str, &ch);
631 }
632
633 /*
634 *---------------------------------------------------------------------------
635 *
636 * Tcl_UtfPrev --
637 *
638 * Given a pointer to some current location in a UTF-8 string,
639 * move backwards one character.
640 *
641 * Results:
642 * The return value is a pointer to the previous character in the
643 * UTF-8 string. If the current location was already at the
644 * beginning of the string, the return value will also be a
645 * pointer to the beginning of the string.
646 *
647 * Side effects:
648 * None.
649 *
650 *---------------------------------------------------------------------------
651 */
652
653 char *
654 Tcl_UtfPrev(str, start)
655 CONST char *str; /* The current location in the string. */
656 CONST char *start; /* Pointer to the beginning of the
657 * string, to avoid going backwards too
658 * far. */
659 {
660 CONST char *look;
661 int i, byte;
662
663 str--;
664 look = str;
665 for (i = 0; i < TCL_UTF_MAX; i++) {
666 if (look < start) {
667 if (str < start) {
668 str = start;
669 }
670 break;
671 }
672 byte = *((unsigned char *) look);
673 if (byte < 0x80) {
674 break;
675 }
676 if (byte >= 0xC0) {
677 if (totalBytes[byte] != i + 1) {
678 break;
679 }
680 return (char *) look;
681 }
682 look--;
683 }
684 return (char *) str;
685 }
686
687 /*
688 *---------------------------------------------------------------------------
689 *
690 * Tcl_UniCharAtIndex --
691 *
692 * Returns the Unicode character represented at the specified
693 * character (not byte) position in the UTF-8 string.
694 *
695 * Results:
696 * As above.
697 *
698 * Side effects:
699 * None.
700 *
701 *---------------------------------------------------------------------------
702 */
703
704 Tcl_UniChar
705 Tcl_UniCharAtIndex(src, index)
706 register CONST char *src; /* The UTF-8 string to dereference. */
707 register int index; /* The position of the desired character. */
708 {
709 Tcl_UniChar ch;
710
711 while (index >= 0) {
712 index--;
713 src += Tcl_UtfToUniChar(src, &ch);
714 }
715 return ch;
716 }
717
718 /*
719 *---------------------------------------------------------------------------
720 *
721 * Tcl_UtfAtIndex --
722 *
723 * Returns a pointer to the specified character (not byte) position
724 * in the UTF-8 string.
725 *
726 * Results:
727 * As above.
728 *
729 * Side effects:
730 * None.
731 *
732 *---------------------------------------------------------------------------
733 */
734
735 char *
736 Tcl_UtfAtIndex(src, index)
737 register CONST char *src; /* The UTF-8 string. */
738 register int index; /* The position of the desired character. */
739 {
740 Tcl_UniChar ch;
741
742 while (index > 0) {
743 index--;
744 src += Tcl_UtfToUniChar(src, &ch);
745 }
746 return (char *) src;
747 }
748
749 /*
750 *---------------------------------------------------------------------------
751 *
752 * Tcl_UtfBackslash --
753 *
754 * Figure out how to handle a backslash sequence.
755 *
756 * Results:
757 * Stores the bytes represented by the backslash sequence in dst and
758 * returns the number of bytes written to dst. At most TCL_UTF_MAX
759 * bytes are written to dst; dst must have been large enough to accept
760 * those bytes. If readPtr isn't NULL then it is filled in with a
761 * count of the number of bytes in the backslash sequence.
762 *
763 * Side effects:
764 * The maximum number of bytes it takes to represent a Unicode
765 * character in UTF-8 is guaranteed to be less than the number of
766 * bytes used to express the backslash sequence that represents
767 * that Unicode character. If the target buffer into which the
768 * caller is going to store the bytes that represent the Unicode
769 * character is at least as large as the source buffer from which
770 * the backslashed sequence was extracted, no buffer overruns should
771 * occur.
772 *
773 *---------------------------------------------------------------------------
774 */
775
776 int
777 Tcl_UtfBackslash(src, readPtr, dst)
778 CONST char *src; /* Points to the backslash character of
779 * a backslash sequence. */
780 int *readPtr; /* Fill in with number of characters read
781 * from src, unless NULL. */
782 char *dst; /* Filled with the bytes represented by the
783 * backslash sequence. */
784 {
785 register CONST char *p = src+1;
786 int result, count, n;
787 char buf[TCL_UTF_MAX];
788
789 if (dst == NULL) {
790 dst = buf;
791 }
792
793 count = 2;
794 switch (*p) {
795 /*
796 * Note: in the conversions below, use absolute values (e.g.,
797 * 0xa) rather than symbolic values (e.g. \n) that get converted
798 * by the compiler. It's possible that compilers on some
799 * platforms will do the symbolic conversions differently, which
800 * could result in non-portable Tcl scripts.
801 */
802
803 case 'a':
804 result = 0x7;
805 break;
806 case 'b':
807 result = 0x8;
808 break;
809 case 'f':
810 result = 0xc;
811 break;
812 case 'n':
813 result = 0xa;
814 break;
815 case 'r':
816 result = 0xd;
817 break;
818 case 't':
819 result = 0x9;
820 break;
821 case 'v':
822 result = 0xb;
823 break;
824 case 'x':
825 if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
826 char *end;
827
828 result = (unsigned char) strtoul(p+1, &end, 16);
829 count = end - src;
830 } else {
831 count = 2;
832 result = 'x';
833 }
834 break;
835 case 'u':
836 result = 0;
837 for (count = 0; count < 4; count++) {
838 p++;
839 if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
840 break;
841 }
842 n = *p - '0';
843 if (n > 9) {
844 n = n + '0' + 10 - 'A';
845 }
846 if (n > 16) {
847 n = n + 'A' - 'a';
848 }
849 result = (result << 4) + n;
850 }
851 if (count == 0) {
852 result = 'u';
853 }
854 count += 2;
855 break;
856
857 case '\n':
858 do {
859 p++;
860 } while ((*p == ' ') || (*p == '\t'));
861 result = ' ';
862 count = p - src;
863 break;
864 case 0:
865 result = '\\';
866 count = 1;
867 break;
868 default:
869 /*
870 * Check for an octal number \oo?o?
871 */
872 if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
873 result = (unsigned char)(*p - '0');
874 p++;
875 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
876 break;
877 }
878 count = 3;
879 result = (unsigned char)((result << 3) + (*p - '0'));
880 p++;
881 if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
882 break;
883 }
884 count = 4;
885 result = (unsigned char)((result << 3) + (*p - '0'));
886 break;
887 }
888 result = *p;
889 count = 2;
890 break;
891 }
892
893 if (readPtr != NULL) {
894 *readPtr = count;
895 }
896 return Tcl_UniCharToUtf(result, dst);
897 }
898
899 /*
900 *----------------------------------------------------------------------
901 *
902 * Tcl_UtfToUpper --
903 *
904 * Convert lowercase characters to uppercase characters in a UTF
905 * string in place. The conversion may shrink the UTF string.
906 *
907 * Results:
908 * Returns the number of bytes in the resulting string
909 * excluding the trailing null.
910 *
911 * Side effects:
912 * Writes a terminating null after the last converted character.
913 *
914 *----------------------------------------------------------------------
915 */
916
917 int
918 Tcl_UtfToUpper(str)
919 char *str; /* String to convert in place. */
920 {
921 Tcl_UniChar ch, upChar;
922 char *src, *dst;
923 int bytes;
924
925 /*
926 * Iterate over the string until we hit the terminating null.
927 */
928
929 src = dst = str;
930 while (*src) {
931 bytes = Tcl_UtfToUniChar(src, &ch);
932 upChar = Tcl_UniCharToUpper(ch);
933
934 /*
935 * To keep badly formed Utf strings from getting inflated by
936 * the conversion (thereby causing a segfault), only copy the
937 * upper case char to dst if its size is <= the original char.
938 */
939
940 if (bytes < UtfCount(upChar)) {
941 memcpy(dst, src, (size_t) bytes);
942 dst += bytes;
943 } else {
944 dst += Tcl_UniCharToUtf(upChar, dst);
945 }
946 src += bytes;
947 }
948 *dst = '\0';
949 return (dst - str);
950 }
951
952 /*
953 *----------------------------------------------------------------------
954 *
955 * Tcl_UtfToLower --
956 *
957 * Convert uppercase characters to lowercase characters in a UTF
958 * string in place. The conversion may shrink the UTF string.
959 *
960 * Results:
961 * Returns the number of bytes in the resulting string
962 * excluding the trailing null.
963 *
964 * Side effects:
965 * Writes a terminating null after the last converted character.
966 *
967 *----------------------------------------------------------------------
968 */
969
970 int
971 Tcl_UtfToLower(str)
972 char *str; /* String to convert in place. */
973 {
974 Tcl_UniChar ch, lowChar;
975 char *src, *dst;
976 int bytes;
977
978 /*
979 * Iterate over the string until we hit the terminating null.
980 */
981
982 src = dst = str;
983 while (*src) {
984 bytes = Tcl_UtfToUniChar(src, &ch);
985 lowChar = Tcl_UniCharToLower(ch);
986
987 /*
988 * To keep badly formed Utf strings from getting inflated by
989 * the conversion (thereby causing a segfault), only copy the
990 * lower case char to dst if its size is <= the original char.
991 */
992
993 if (bytes < UtfCount(lowChar)) {
994 memcpy(dst, src, (size_t) bytes);
995 dst += bytes;
996 } else {
997 dst += Tcl_UniCharToUtf(lowChar, dst);
998 }
999 src += bytes;
1000 }
1001 *dst = '\0';
1002 return (dst - str);
1003 }
1004
1005 /*
1006 *----------------------------------------------------------------------
1007 *
1008 * Tcl_UtfToTitle --
1009 *
1010 * Changes the first character of a UTF string to title case or
1011 * uppercase and the rest of the string to lowercase. The
1012 * conversion happens in place and may shrink the UTF string.
1013 *
1014 * Results:
1015 * Returns the number of bytes in the resulting string
1016 * excluding the trailing null.
1017 *
1018 * Side effects:
1019 * Writes a terminating null after the last converted character.
1020 *
1021 *----------------------------------------------------------------------
1022 */
1023
1024 int
1025 Tcl_UtfToTitle(str)
1026 char *str; /* String to convert in place. */
1027 {
1028 Tcl_UniChar ch, titleChar, lowChar;
1029 char *src, *dst;
1030 int bytes;
1031
1032 /*
1033 * Capitalize the first character and then lowercase the rest of the
1034 * characters until we get to a null.
1035 */
1036
1037 src = dst = str;
1038
1039 if (*src) {
1040 bytes = Tcl_UtfToUniChar(src, &ch);
1041 titleChar = Tcl_UniCharToTitle(ch);
1042
1043 if (bytes < UtfCount(titleChar)) {
1044 memcpy(dst, src, (size_t) bytes);
1045 dst += bytes;
1046 } else {
1047 dst += Tcl_UniCharToUtf(titleChar, dst);
1048 }
1049 src += bytes;
1050 }
1051 while (*src) {
1052 bytes = Tcl_UtfToUniChar(src, &ch);
1053 lowChar = Tcl_UniCharToLower(ch);
1054
1055 if (bytes < UtfCount(lowChar)) {
1056 memcpy(dst, src, (size_t) bytes);
1057 dst += bytes;
1058 } else {
1059 dst += Tcl_UniCharToUtf(lowChar, dst);
1060 }
1061 src += bytes;
1062 }
1063 *dst = '\0';
1064 return (dst - str);
1065 }
1066
1067 /*
1068 *----------------------------------------------------------------------
1069 *
1070 * Tcl_UtfNcmp --
1071 *
1072 * Compare at most n UTF chars of string cs to string ct. Both cs
1073 * and ct are assumed to be at least n UTF chars long.
1074 *
1075 * Results:
1076 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1077 *
1078 * Side effects:
1079 * None.
1080 *
1081 *----------------------------------------------------------------------
1082 */
1083
1084 int
1085 Tcl_UtfNcmp(cs, ct, n)
1086 CONST char *cs; /* UTF string to compare to ct. */
1087 CONST char *ct; /* UTF string cs is compared to. */
1088 unsigned long n; /* Number of UTF chars to compare. */
1089 {
1090 Tcl_UniChar ch1, ch2;
1091 /*
1092 * Another approach that should work is:
1093 * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1094 * That assumes that ct is a properly formed UTF, so we will just
1095 * be comparing the bytes that compromise those strings to the
1096 * char length n.
1097 */
1098 while (n-- > 0) {
1099 /*
1100 * n must be interpreted as chars, not bytes.
1101 * This should be called only when both strings are of
1102 * at least n chars long (no need for \0 check)
1103 */
1104 cs += Tcl_UtfToUniChar(cs, &ch1);
1105 ct += Tcl_UtfToUniChar(ct, &ch2);
1106 if (ch1 != ch2) {
1107 return (ch1 - ch2);
1108 }
1109 }
1110 return 0;
1111 }
1112
1113 /*
1114 *----------------------------------------------------------------------
1115 *
1116 * Tcl_UtfNcasecmp --
1117 *
1118 * Compare at most n UTF chars of string cs to string ct case
1119 * insensitive. Both cs and ct are assumed to be at least n
1120 * UTF chars long.
1121 *
1122 * Results:
1123 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1124 *
1125 * Side effects:
1126 * None.
1127 *
1128 *----------------------------------------------------------------------
1129 */
1130
1131 int
1132 Tcl_UtfNcasecmp(cs, ct, n)
1133 CONST char *cs; /* UTF string to compare to ct. */
1134 CONST char *ct; /* UTF string cs is compared to. */
1135 unsigned long n; /* Number of UTF chars to compare. */
1136 {
1137 Tcl_UniChar ch1, ch2;
1138 while (n-- > 0) {
1139 /*
1140 * n must be interpreted as chars, not bytes.
1141 * This should be called only when both strings are of
1142 * at least n chars long (no need for \0 check)
1143 */
1144 cs += Tcl_UtfToUniChar(cs, &ch1);
1145 ct += Tcl_UtfToUniChar(ct, &ch2);
1146 if (ch1 != ch2) {
1147 ch1 = Tcl_UniCharToLower(ch1);
1148 ch2 = Tcl_UniCharToLower(ch2);
1149 if (ch1 != ch2) {
1150 return (ch1 - ch2);
1151 }
1152 }
1153 }
1154 return 0;
1155 }
1156
1157 /*
1158 *----------------------------------------------------------------------
1159 *
1160 * Tcl_UniCharToUpper --
1161 *
1162 * Compute the uppercase equivalent of the given Unicode character.
1163 *
1164 * Results:
1165 * Returns the uppercase Unicode character.
1166 *
1167 * Side effects:
1168 * None.
1169 *
1170 *----------------------------------------------------------------------
1171 */
1172
1173 Tcl_UniChar
1174 Tcl_UniCharToUpper(ch)
1175 int ch; /* Unicode character to convert. */
1176 {
1177 int info = GetUniCharInfo(ch);
1178
1179 if (GetCaseType(info) & 0x04) {
1180 return (Tcl_UniChar) (ch - GetDelta(info));
1181 } else {
1182 return ch;
1183 }
1184 }
1185
1186 /*
1187 *----------------------------------------------------------------------
1188 *
1189 * Tcl_UniCharToLower --
1190 *
1191 * Compute the lowercase equivalent of the given Unicode character.
1192 *
1193 * Results:
1194 * Returns the lowercase Unicode character.
1195 *
1196 * Side effects:
1197 * None.
1198 *
1199 *----------------------------------------------------------------------
1200 */
1201
1202 Tcl_UniChar
1203 Tcl_UniCharToLower(ch)
1204 int ch; /* Unicode character to convert. */
1205 {
1206 int info = GetUniCharInfo(ch);
1207
1208 if (GetCaseType(info) & 0x02) {
1209 return (Tcl_UniChar) (ch + GetDelta(info));
1210 } else {
1211 return ch;
1212 }
1213 }
1214
1215 /*
1216 *----------------------------------------------------------------------
1217 *
1218 * Tcl_UniCharToTitle --
1219 *
1220 * Compute the titlecase equivalent of the given Unicode character.
1221 *
1222 * Results:
1223 * Returns the titlecase Unicode character.
1224 *
1225 * Side effects:
1226 * None.
1227 *
1228 *----------------------------------------------------------------------
1229 */
1230
1231 Tcl_UniChar
1232 Tcl_UniCharToTitle(ch)
1233 int ch; /* Unicode character to convert. */
1234 {
1235 int info = GetUniCharInfo(ch);
1236 int mode = GetCaseType(info);
1237
1238 if (mode & 0x1) {
1239 /*
1240 * Subtract or add one depending on the original case.
1241 */
1242
1243 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1244 } else if (mode == 0x4) {
1245 return (Tcl_UniChar) (ch - GetDelta(info));
1246 } else {
1247 return ch;
1248 }
1249 }
1250
1251 /*
1252 *----------------------------------------------------------------------
1253 *
1254 * Tcl_UniCharLen --
1255 *
1256 * Find the length of a UniChar string. The str input must be null
1257 * terminated.
1258 *
1259 * Results:
1260 * Returns the length of str in UniChars (not bytes).
1261 *
1262 * Side effects:
1263 * None.
1264 *
1265 *----------------------------------------------------------------------
1266 */
1267
1268 int
1269 Tcl_UniCharLen(str)
1270 Tcl_UniChar *str; /* Unicode string to find length of. */
1271 {
1272 int len = 0;
1273
1274 while (*str != '\0') {
1275 len++;
1276 str++;
1277 }
1278 return len;
1279 }
1280
1281 /*
1282 *----------------------------------------------------------------------
1283 *
1284 * Tcl_UniCharNcmp --
1285 *
1286 * Compare at most n unichars of string cs to string ct. Both cs
1287 * and ct are assumed to be at least n unichars long.
1288 *
1289 * Results:
1290 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1291 *
1292 * Side effects:
1293 * None.
1294 *
1295 *----------------------------------------------------------------------
1296 */
1297
1298 int
1299 Tcl_UniCharNcmp(cs, ct, n)
1300 CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1301 CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1302 unsigned long n; /* Number of unichars to compare. */
1303 {
1304 for ( ; n != 0; n--, cs++, ct++) {
1305 if (*cs != *ct) {
1306 return *cs - *ct;
1307 }
1308 if (*cs == '\0') {
1309 break;
1310 }
1311 }
1312 return 0;
1313 }
1314
1315 /*
1316 *----------------------------------------------------------------------
1317 *
1318 * Tcl_UniCharIsAlnum --
1319 *
1320 * Test if a character is an alphanumeric Unicode character.
1321 *
1322 * Results:
1323 * Returns 1 if character is alphanumeric.
1324 *
1325 * Side effects:
1326 * None.
1327 *
1328 *----------------------------------------------------------------------
1329 */
1330
1331 int
1332 Tcl_UniCharIsAlnum(ch)
1333 int ch; /* Unicode character to test. */
1334 {
1335 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1336
1337 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1338 }
1339
1340 /*
1341 *----------------------------------------------------------------------
1342 *
1343 * Tcl_UniCharIsAlpha --
1344 *
1345 * Test if a character is an alphabetic Unicode character.
1346 *
1347 * Results:
1348 * Returns 1 if character is alphabetic.
1349 *
1350 * Side effects:
1351 * None.
1352 *
1353 *----------------------------------------------------------------------
1354 */
1355
1356 int
1357 Tcl_UniCharIsAlpha(ch)
1358 int ch; /* Unicode character to test. */
1359 {
1360 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1361 return ((ALPHA_BITS >> category) & 1);
1362 }
1363
1364 /*
1365 *----------------------------------------------------------------------
1366 *
1367 * Tcl_UniCharIsControl --
1368 *
1369 * Test if a character is a Unicode control character.
1370 *
1371 * Results:
1372 * Returns non-zero if character is a control.
1373 *
1374 * Side effects:
1375 * None.
1376 *
1377 *----------------------------------------------------------------------
1378 */
1379
1380 int
1381 Tcl_UniCharIsControl(ch)
1382 int ch; /* Unicode character to test. */
1383 {
1384 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1385 }
1386
1387 /*
1388 *----------------------------------------------------------------------
1389 *
1390 * Tcl_UniCharIsDigit --
1391 *
1392 * Test if a character is a numeric Unicode character.
1393 *
1394 * Results:
1395 * Returns non-zero if character is a digit.
1396 *
1397 * Side effects:
1398 * None.
1399 *
1400 *----------------------------------------------------------------------
1401 */
1402
1403 int
1404 Tcl_UniCharIsDigit(ch)
1405 int ch; /* Unicode character to test. */
1406 {
1407 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1408 == DECIMAL_DIGIT_NUMBER);
1409 }
1410
1411 /*
1412 *----------------------------------------------------------------------
1413 *
1414 * Tcl_UniCharIsGraph --
1415 *
1416 * Test if a character is any Unicode print character except space.
1417 *
1418 * Results:
1419 * Returns non-zero if character is printable, but not space.
1420 *
1421 * Side effects:
1422 * None.
1423 *
1424 *----------------------------------------------------------------------
1425 */
1426
1427 int
1428 Tcl_UniCharIsGraph(ch)
1429 int ch; /* Unicode character to test. */
1430 {
1431 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1432 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1433 }
1434
1435 /*
1436 *----------------------------------------------------------------------
1437 *
1438 * Tcl_UniCharIsLower --
1439 *
1440 * Test if a character is a lowercase Unicode character.
1441 *
1442 * Results:
1443 * Returns non-zero if character is lowercase.
1444 *
1445 * Side effects:
1446 * None.
1447 *
1448 *----------------------------------------------------------------------
1449 */
1450
1451 int
1452 Tcl_UniCharIsLower(ch)
1453 int ch; /* Unicode character to test. */
1454 {
1455 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1456 }
1457
1458 /*
1459 *----------------------------------------------------------------------
1460 *
1461 * Tcl_UniCharIsPrint --
1462 *
1463 * Test if a character is a Unicode print character.
1464 *
1465 * Results:
1466 * Returns non-zero if character is printable.
1467 *
1468 * Side effects:
1469 * None.
1470 *
1471 *----------------------------------------------------------------------
1472 */
1473
1474 int
1475 Tcl_UniCharIsPrint(ch)
1476 int ch; /* Unicode character to test. */
1477 {
1478 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1479 return ((PRINT_BITS >> category) & 1);
1480 }
1481
1482 /*
1483 *----------------------------------------------------------------------
1484 *
1485 * Tcl_UniCharIsPunct --
1486 *
1487 * Test if a character is a Unicode punctuation character.
1488 *
1489 * Results:
1490 * Returns non-zero if character is punct.
1491 *
1492 * Side effects:
1493 * None.
1494 *
1495 *----------------------------------------------------------------------
1496 */
1497
1498 int
1499 Tcl_UniCharIsPunct(ch)
1500 int ch; /* Unicode character to test. */
1501 {
1502 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1503 return ((PUNCT_BITS >> category) & 1);
1504 }
1505
1506 /*
1507 *----------------------------------------------------------------------
1508 *
1509 * Tcl_UniCharIsSpace --
1510 *
1511 * Test if a character is a whitespace Unicode character.
1512 *
1513 * Results:
1514 * Returns non-zero if character is a space.
1515 *
1516 * Side effects:
1517 * None.
1518 *
1519 *----------------------------------------------------------------------
1520 */
1521
1522 int
1523 Tcl_UniCharIsSpace(ch)
1524 int ch; /* Unicode character to test. */
1525 {
1526 register int category;
1527
1528 /*
1529 * If the character is within the first 127 characters, just use the
1530 * standard C function, otherwise consult the Unicode table.
1531 */
1532
1533 if (ch < 0x80) {
1534 return isspace(UCHAR(ch)); /* INTL: ISO space */
1535 } else {
1536 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1537 return ((SPACE_BITS >> category) & 1);
1538 }
1539 }
1540
1541 /*
1542 *----------------------------------------------------------------------
1543 *
1544 * Tcl_UniCharIsUpper --
1545 *
1546 * Test if a character is a uppercase Unicode character.
1547 *
1548 * Results:
1549 * Returns non-zero if character is uppercase.
1550 *
1551 * Side effects:
1552 * None.
1553 *
1554 *----------------------------------------------------------------------
1555 */
1556
1557 int
1558 Tcl_UniCharIsUpper(ch)
1559 int ch; /* Unicode character to test. */
1560 {
1561 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1562 }
1563
1564 /*
1565 *----------------------------------------------------------------------
1566 *
1567 * Tcl_UniCharIsWordChar --
1568 *
1569 * Test if a character is alphanumeric or a connector punctuation
1570 * mark.
1571 *
1572 * Results:
1573 * Returns 1 if character is a word character.
1574 *
1575 * Side effects:
1576 * None.
1577 *
1578 *----------------------------------------------------------------------
1579 */
1580
1581 int
1582 Tcl_UniCharIsWordChar(ch)
1583 int ch; /* Unicode character to test. */
1584 {
1585 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1586
1587 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1588 }
1589
1590
1591 /* $History: tclutf.c $
1592 *
1593 * ***************** Version 1 *****************
1594 * User: Dtashley Date: 1/02/01 Time: 1:05a
1595 * Created in $/IjuScripter, IjuConsole/Source/Tcl Base
1596 * Initial check-in.
1597 */
1598
1599 /* End of TCL_UTF.C */

Properties

Name Value
svn:keywords Header

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25