/[dtapublic]/projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Annotation of /projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 67 - (hide annotations) (download)
Mon Oct 31 00:57:34 2016 UTC (7 years, 7 months ago) by dashley
File MIME type: text/plain
File size: 39491 byte(s)
Header and footer cleanup.
1 dashley 64 /* $Header$ */
2 dashley 25 /*
3     * tclUtf.c --
4     *
5     * Routines for manipulating UTF-8 strings.
6     *
7     * Copyright (c) 1997-1998 Sun Microsystems, Inc.
8     *
9     * See the file "license.terms" for information on usage and redistribution
10     * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
11     *
12     * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
13     */
14    
15     #include "tclInt.h"
16    
17     /*
18     * Include the static character classification tables and macros.
19     */
20    
21     #include "tclUniData.c"
22    
/*
 * Bit masks used for fast character category tests.  Each x_BITS value has
 * one bit set for every category that belongs to the class; a mask is
 * shifted right by a character's category value to determine whether that
 * category is included in the set.
 */

#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) \
    | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) \
    | (1 << MODIFIER_LETTER) \
    | (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) \
    | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/*
 * Every punctuation category, including connector punctuation.
 */

#define PUNCT_BITS \
    (CONNECTOR_BITS \
    | (1 << DASH_PUNCTUATION) \
    | (1 << OPEN_PUNCTUATION) \
    | (1 << CLOSE_PUNCTUATION) \
    | (1 << INITIAL_QUOTE_PUNCTUATION) \
    | (1 << FINAL_QUOTE_PUNCTUATION) \
    | (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, punctuation, marks,
 * the remaining number categories, and symbols.
 */

#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS \
    | (1 << NON_SPACING_MARK) \
    | (1 << ENCLOSING_MARK) \
    | (1 << COMBINING_SPACING_MARK) \
    | (1 << LETTER_NUMBER) \
    | (1 << OTHER_NUMBER) \
    | (1 << MATH_SYMBOL) \
    | (1 << CURRENCY_SYMBOL) \
    | (1 << MODIFIER_SYMBOL) \
    | (1 << OTHER_SYMBOL))

/*
 * Unicode characters below this value are stored as a single byte equal to
 * the character itself in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
60    
61     /*
62     * The following structures are used when mapping between Unicode (UCS-2)
63     * and UTF-8.
64     */
65    
66     CONST unsigned char totalBytes[256] = {
67     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
74     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
75     #if TCL_UTF_MAX > 3
76     4,4,4,4,4,4,4,4,
77     #else
78     1,1,1,1,1,1,1,1,
79     #endif
80     #if TCL_UTF_MAX > 4
81     5,5,5,5,
82     #else
83     1,1,1,1,
84     #endif
85     #if TCL_UTF_MAX > 5
86     6,6,6,6
87     #else
88     1,1,1,1
89     #endif
90     };
91    
92     /*
93     * Procedures used only in this module.
94     */
95    
96     static int UtfCount _ANSI_ARGS_((int ch));
97    
98    
99     /*
100     *---------------------------------------------------------------------------
101     *
102     * UtfCount --
103     *
104     * Find the number of bytes in the Utf character "ch".
105     *
106     * Results:
107     * The return values is the number of bytes in the Utf character "ch".
108     *
109     * Side effects:
110     * None.
111     *
112     *---------------------------------------------------------------------------
113     */
114    
115     static int
116     UtfCount(ch)
117     int ch; /* The Tcl_UniChar whose size is returned. */
118     {
119     if ((ch > 0) && (ch < UNICODE_SELF)) {
120     return 1;
121     }
122     if (ch <= 0x7FF) {
123     return 2;
124     }
125     if (ch <= 0xFFFF) {
126     return 3;
127     }
128     #if TCL_UTF_MAX > 3
129     if (ch <= 0x1FFFFF) {
130     return 4;
131     }
132     if (ch <= 0x3FFFFFF) {
133     return 5;
134     }
135     if (ch <= 0x7FFFFFFF) {
136     return 6;
137     }
138     #endif
139     return 3;
140     }
141    
142     /*
143     *---------------------------------------------------------------------------
144     *
145     * Tcl_UniCharToUtf --
146     *
147     * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
148     * provided buffer. Equivalent to Plan 9 runetochar().
149     *
150     * Results:
151     * The return values is the number of bytes in the buffer that
152     * were consumed.
153     *
154     * Side effects:
155     * None.
156     *
157     *---------------------------------------------------------------------------
158     */
159    
160     INLINE int
161     Tcl_UniCharToUtf(ch, str)
162     int ch; /* The Tcl_UniChar to be stored in the
163     * buffer. */
164     char *str; /* Buffer in which the UTF-8 representation
165     * of the Tcl_UniChar is stored. Buffer must
166     * be large enough to hold the UTF-8 character
167     * (at most TCL_UTF_MAX bytes). */
168     {
169     if ((ch > 0) && (ch < UNICODE_SELF)) {
170     str[0] = (char) ch;
171     return 1;
172     }
173     if (ch <= 0x7FF) {
174     str[1] = (char) ((ch | 0x80) & 0xBF);
175     str[0] = (char) ((ch >> 6) | 0xC0);
176     return 2;
177     }
178     if (ch <= 0xFFFF) {
179     three:
180     str[2] = (char) ((ch | 0x80) & 0xBF);
181     str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
182     str[0] = (char) ((ch >> 12) | 0xE0);
183     return 3;
184     }
185    
186     #if TCL_UTF_MAX > 3
187     if (ch <= 0x1FFFFF) {
188     str[3] = (char) ((ch | 0x80) & 0xBF);
189     str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
190     str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
191     str[0] = (char) ((ch >> 18) | 0xF0);
192     return 4;
193     }
194     if (ch <= 0x3FFFFFF) {
195     str[4] = (char) ((ch | 0x80) & 0xBF);
196     str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
197     str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
198     str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
199     str[0] = (char) ((ch >> 24) | 0xF8);
200     return 5;
201     }
202     if (ch <= 0x7FFFFFFF) {
203     str[5] = (char) ((ch | 0x80) & 0xBF);
204     str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
205     str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
206     str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
207     str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
208     str[0] = (char) ((ch >> 30) | 0xFC);
209     return 6;
210     }
211     #endif
212    
213     ch = 0xFFFD;
214     goto three;
215     }
216    
217     /*
218     *---------------------------------------------------------------------------
219     *
220     * Tcl_UniCharToUtfDString --
221     *
222     * Convert the given Unicode string to UTF-8.
223     *
224     * Results:
225     * The return value is a pointer to the UTF-8 representation of the
226     * Unicode string. Storage for the return value is appended to the
227     * end of dsPtr.
228     *
229     * Side effects:
230     * None.
231     *
232     *---------------------------------------------------------------------------
233     */
234    
235     char *
236     Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
237     CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
238     int numChars; /* Length of Unicode string in Tcl_UniChars
239     * (must be >= 0). */
240     Tcl_DString *dsPtr; /* UTF-8 representation of string is
241     * appended to this previously initialized
242     * DString. */
243     {
244     CONST Tcl_UniChar *w, *wEnd;
245     char *p, *string;
246     int oldLength;
247    
248     /*
249     * UTF-8 string length in bytes will be <= Unicode string length *
250     * TCL_UTF_MAX.
251     */
252    
253     oldLength = Tcl_DStringLength(dsPtr);
254     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
255     string = Tcl_DStringValue(dsPtr) + oldLength;
256    
257     p = string;
258     wEnd = wString + numChars;
259     for (w = wString; w < wEnd; ) {
260     p += Tcl_UniCharToUtf(*w, p);
261     w++;
262     }
263     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
264    
265     return string;
266     }
267    
268     /*
269     *---------------------------------------------------------------------------
270     *
271     * Tcl_UtfToUniChar --
272     *
273     * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
274     * UTF-8 sequences are converted to valid Tcl_UniChars and processing
275     * continues. Equivalent to Plan 9 chartorune().
276     *
277     * The caller must ensure that the source buffer is long enough that
278     * this routine does not run off the end and dereference non-existent
279     * memory looking for trail bytes. If the source buffer is known to
280     * be '\0' terminated, this cannot happen. Otherwise, the caller
281     * should call Tcl_UtfCharComplete() before calling this routine to
282     * ensure that enough bytes remain in the string.
283     *
284     * Results:
285     * *chPtr is filled with the Tcl_UniChar, and the return value is the
286     * number of bytes from the UTF-8 string that were consumed.
287     *
288     * Side effects:
289     * None.
290     *
291     *---------------------------------------------------------------------------
292     */
293    
294     int
295     Tcl_UtfToUniChar(str, chPtr)
296     register CONST char *str; /* The UTF-8 string. */
297     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
298     * by the UTF-8 string. */
299     {
300     register int byte;
301    
302     /*
303     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
304     */
305    
306     byte = *((unsigned char *) str);
307     if (byte < 0xC0) {
308     /*
309     * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
310     * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
311     * characters representing themselves.
312     */
313    
314     *chPtr = (Tcl_UniChar) byte;
315     return 1;
316     } else if (byte < 0xE0) {
317     if ((str[1] & 0xC0) == 0x80) {
318     /*
319     * Two-byte-character lead-byte followed by a trail-byte.
320     */
321    
322     *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
323     return 2;
324     }
325     /*
326     * A two-byte-character lead-byte not followed by trail-byte
327     * represents itself.
328     */
329    
330     *chPtr = (Tcl_UniChar) byte;
331     return 1;
332     } else if (byte < 0xF0) {
333     if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
334     /*
335     * Three-byte-character lead byte followed by two trail bytes.
336     */
337    
338     *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339     | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
340     return 3;
341     }
342     /*
343     * A three-byte-character lead-byte not followed by two trail-bytes
344     * represents itself.
345     */
346    
347     *chPtr = (Tcl_UniChar) byte;
348     return 1;
349     }
350     #if TCL_UTF_MAX > 3
351     else {
352     int ch, total, trail;
353    
354     total = totalBytes[byte];
355     trail = total - 1;
356     if (trail > 0) {
357     ch = byte & (0x3F >> trail);
358     do {
359     str++;
360     if ((*str & 0xC0) != 0x80) {
361     *chPtr = byte;
362     return 1;
363     }
364     ch <<= 6;
365     ch |= (*str & 0x3F);
366     trail--;
367     } while (trail > 0);
368     *chPtr = ch;
369     return total;
370     }
371     }
372     #endif
373    
374     *chPtr = (Tcl_UniChar) byte;
375     return 1;
376     }
377    
378     /*
379     *---------------------------------------------------------------------------
380     *
381     * Tcl_UtfToUniCharDString --
382     *
383     * Convert the UTF-8 string to Unicode.
384     *
385     * Results:
386     * The return value is a pointer to the Unicode representation of the
387     * UTF-8 string. Storage for the return value is appended to the
388     * end of dsPtr. The Unicode string is terminated with a Unicode
389     * NULL character.
390     *
391     * Side effects:
392     * None.
393     *
394     *---------------------------------------------------------------------------
395     */
396    
397     Tcl_UniChar *
398     Tcl_UtfToUniCharDString(string, length, dsPtr)
399     CONST char *string; /* UTF-8 string to convert to Unicode. */
400     int length; /* Length of UTF-8 string in bytes, or -1
401     * for strlen(). */
402     Tcl_DString *dsPtr; /* Unicode representation of string is
403     * appended to this previously initialized
404     * DString. */
405     {
406     Tcl_UniChar *w, *wString;
407     CONST char *p, *end;
408     int oldLength;
409    
410     if (length < 0) {
411     length = strlen(string);
412     }
413    
414     /*
415     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
416     * in bytes.
417     */
418    
419     oldLength = Tcl_DStringLength(dsPtr);
420     Tcl_DStringSetLength(dsPtr,
421     (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423    
424     w = wString;
425     end = string + length;
426     for (p = string; p < end; ) {
427     p += Tcl_UtfToUniChar(p, w);
428     w++;
429     }
430     *w = '\0';
431     Tcl_DStringSetLength(dsPtr,
432     (oldLength + ((char *) w - (char *) wString)));
433    
434     return wString;
435     }
436    
437     /*
438     *---------------------------------------------------------------------------
439     *
440     * Tcl_UtfCharComplete --
441     *
442     * Determine if the UTF-8 string of the given length is long enough
443     * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
444     * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
445     *
446     * Results:
447     * The return value is 0 if the string is not long enough, non-zero
448     * otherwise.
449     *
450     * Side effects:
451     * None.
452     *
453     *---------------------------------------------------------------------------
454     */
455    
456     int
457     Tcl_UtfCharComplete(str, len)
458     CONST char *str; /* String to check if first few bytes
459     * contain a complete UTF-8 character. */
460     int len; /* Length of above string in bytes. */
461     {
462     int ch;
463    
464     ch = *((unsigned char *) str);
465     return len >= totalBytes[ch];
466     }
467    
468     /*
469     *---------------------------------------------------------------------------
470     *
471     * Tcl_NumUtfChars --
472     *
473     * Returns the number of characters (not bytes) in the UTF-8 string,
474     * not including the terminating NULL byte. This is equivalent to
475     * Plan 9 utflen() and utfnlen().
476     *
477     * Results:
478     * As above.
479     *
480     * Side effects:
481     * None.
482     *
483     *---------------------------------------------------------------------------
484     */
485    
486     int
487     Tcl_NumUtfChars(str, len)
488     register CONST char *str; /* The UTF-8 string to measure. */
489     int len; /* The length of the string in bytes, or -1
490     * for strlen(string). */
491     {
492     Tcl_UniChar ch;
493     register Tcl_UniChar *chPtr = &ch;
494     register int n;
495     int i;
496    
497     /*
498     * The separate implementations are faster.
499     */
500    
501     i = 0;
502     if (len < 0) {
503     while (1) {
504     str += Tcl_UtfToUniChar(str, chPtr);
505     if (ch == '\0') {
506     break;
507     }
508     i++;
509     }
510     } else {
511     while (len > 0) {
512     n = Tcl_UtfToUniChar(str, chPtr);
513     len -= n;
514     str += n;
515     i++;
516     }
517     }
518     return i;
519     }
520    
521     /*
522     *---------------------------------------------------------------------------
523     *
524     * Tcl_UtfFindFirst --
525     *
526     * Returns a pointer to the first occurance of the given Tcl_UniChar
527     * in the NULL-terminated UTF-8 string. The NULL terminator is
528     * considered part of the UTF-8 string. Equivalent to Plan 9
529     * utfrune().
530     *
531     * Results:
532     * As above. If the Tcl_UniChar does not exist in the given string,
533     * the return value is NULL.
534     *
535     * Side effects:
536     * None.
537     *
538     *---------------------------------------------------------------------------
539     */
540     char *
541     Tcl_UtfFindFirst(string, ch)
542     CONST char *string; /* The UTF-8 string to be searched. */
543     int ch; /* The Tcl_UniChar to search for. */
544     {
545     int len;
546     Tcl_UniChar find;
547    
548     while (1) {
549     len = Tcl_UtfToUniChar(string, &find);
550     if (find == ch) {
551     return (char *) string;
552     }
553     if (*string == '\0') {
554     return NULL;
555     }
556     string += len;
557     }
558     }
559    
560     /*
561     *---------------------------------------------------------------------------
562     *
563     * Tcl_UtfFindLast --
564     *
565     * Returns a pointer to the last occurance of the given Tcl_UniChar
566     * in the NULL-terminated UTF-8 string. The NULL terminator is
567     * considered part of the UTF-8 string. Equivalent to Plan 9
568     * utfrrune().
569     *
570     * Results:
571     * As above. If the Tcl_UniChar does not exist in the given string,
572     * the return value is NULL.
573     *
574     * Side effects:
575     * None.
576     *
577     *---------------------------------------------------------------------------
578     */
579    
580     char *
581     Tcl_UtfFindLast(string, ch)
582     CONST char *string; /* The UTF-8 string to be searched. */
583     int ch; /* The Tcl_UniChar to search for. */
584     {
585     int len;
586     Tcl_UniChar find;
587     CONST char *last;
588    
589     last = NULL;
590     while (1) {
591     len = Tcl_UtfToUniChar(string, &find);
592     if (find == ch) {
593     last = string;
594     }
595     if (*string == '\0') {
596     break;
597     }
598     string += len;
599     }
600     return (char *) last;
601     }
602    
603     /*
604     *---------------------------------------------------------------------------
605     *
606     * Tcl_UtfNext --
607     *
608     * Given a pointer to some current location in a UTF-8 string,
609     * move forward one character. The caller must ensure that they
610     * are not asking for the next character after the last character
611     * in the string.
612     *
613     * Results:
614     * The return value is the pointer to the next character in
615     * the UTF-8 string.
616     *
617     * Side effects:
618     * None.
619     *
620     *---------------------------------------------------------------------------
621     */
622    
623     char *
624     Tcl_UtfNext(str)
625     CONST char *str; /* The current location in the string. */
626     {
627     Tcl_UniChar ch;
628    
629     return (char *) str + Tcl_UtfToUniChar(str, &ch);
630     }
631    
632     /*
633     *---------------------------------------------------------------------------
634     *
635     * Tcl_UtfPrev --
636     *
637     * Given a pointer to some current location in a UTF-8 string,
638     * move backwards one character.
639     *
640     * Results:
641     * The return value is a pointer to the previous character in the
642     * UTF-8 string. If the current location was already at the
643     * beginning of the string, the return value will also be a
644     * pointer to the beginning of the string.
645     *
646     * Side effects:
647     * None.
648     *
649     *---------------------------------------------------------------------------
650     */
651    
652     char *
653     Tcl_UtfPrev(str, start)
654     CONST char *str; /* The current location in the string. */
655     CONST char *start; /* Pointer to the beginning of the
656     * string, to avoid going backwards too
657     * far. */
658     {
659     CONST char *look;
660     int i, byte;
661    
662     str--;
663     look = str;
664     for (i = 0; i < TCL_UTF_MAX; i++) {
665     if (look < start) {
666     if (str < start) {
667     str = start;
668     }
669     break;
670     }
671     byte = *((unsigned char *) look);
672     if (byte < 0x80) {
673     break;
674     }
675     if (byte >= 0xC0) {
676     if (totalBytes[byte] != i + 1) {
677     break;
678     }
679     return (char *) look;
680     }
681     look--;
682     }
683     return (char *) str;
684     }
685    
686     /*
687     *---------------------------------------------------------------------------
688     *
689     * Tcl_UniCharAtIndex --
690     *
691     * Returns the Unicode character represented at the specified
692     * character (not byte) position in the UTF-8 string.
693     *
694     * Results:
695     * As above.
696     *
697     * Side effects:
698     * None.
699     *
700     *---------------------------------------------------------------------------
701     */
702    
703     Tcl_UniChar
704     Tcl_UniCharAtIndex(src, index)
705     register CONST char *src; /* The UTF-8 string to dereference. */
706     register int index; /* The position of the desired character. */
707     {
708     Tcl_UniChar ch;
709    
710     while (index >= 0) {
711     index--;
712     src += Tcl_UtfToUniChar(src, &ch);
713     }
714     return ch;
715     }
716    
717     /*
718     *---------------------------------------------------------------------------
719     *
720     * Tcl_UtfAtIndex --
721     *
722     * Returns a pointer to the specified character (not byte) position
723     * in the UTF-8 string.
724     *
725     * Results:
726     * As above.
727     *
728     * Side effects:
729     * None.
730     *
731     *---------------------------------------------------------------------------
732     */
733    
734     char *
735     Tcl_UtfAtIndex(src, index)
736     register CONST char *src; /* The UTF-8 string. */
737     register int index; /* The position of the desired character. */
738     {
739     Tcl_UniChar ch;
740    
741     while (index > 0) {
742     index--;
743     src += Tcl_UtfToUniChar(src, &ch);
744     }
745     return (char *) src;
746     }
747    
748     /*
749     *---------------------------------------------------------------------------
750     *
751     * Tcl_UtfBackslash --
752     *
753     * Figure out how to handle a backslash sequence.
754     *
755     * Results:
756     * Stores the bytes represented by the backslash sequence in dst and
757     * returns the number of bytes written to dst. At most TCL_UTF_MAX
758     * bytes are written to dst; dst must have been large enough to accept
759     * those bytes. If readPtr isn't NULL then it is filled in with a
760     * count of the number of bytes in the backslash sequence.
761     *
762     * Side effects:
763     * The maximum number of bytes it takes to represent a Unicode
764     * character in UTF-8 is guaranteed to be less than the number of
765     * bytes used to express the backslash sequence that represents
766     * that Unicode character. If the target buffer into which the
767     * caller is going to store the bytes that represent the Unicode
768     * character is at least as large as the source buffer from which
769     * the backslashed sequence was extracted, no buffer overruns should
770     * occur.
771     *
772     *---------------------------------------------------------------------------
773     */
774    
775     int
776     Tcl_UtfBackslash(src, readPtr, dst)
777     CONST char *src; /* Points to the backslash character of
778     * a backslash sequence. */
779     int *readPtr; /* Fill in with number of characters read
780     * from src, unless NULL. */
781     char *dst; /* Filled with the bytes represented by the
782     * backslash sequence. */
783     {
784     register CONST char *p = src+1;
785     int result, count, n;
786     char buf[TCL_UTF_MAX];
787    
788     if (dst == NULL) {
789     dst = buf;
790     }
791    
792     count = 2;
793     switch (*p) {
794     /*
795     * Note: in the conversions below, use absolute values (e.g.,
796     * 0xa) rather than symbolic values (e.g. \n) that get converted
797     * by the compiler. It's possible that compilers on some
798     * platforms will do the symbolic conversions differently, which
799     * could result in non-portable Tcl scripts.
800     */
801    
802     case 'a':
803     result = 0x7;
804     break;
805     case 'b':
806     result = 0x8;
807     break;
808     case 'f':
809     result = 0xc;
810     break;
811     case 'n':
812     result = 0xa;
813     break;
814     case 'r':
815     result = 0xd;
816     break;
817     case 't':
818     result = 0x9;
819     break;
820     case 'v':
821     result = 0xb;
822     break;
823     case 'x':
824     if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
825     char *end;
826    
827     result = (unsigned char) strtoul(p+1, &end, 16);
828     count = end - src;
829     } else {
830     count = 2;
831     result = 'x';
832     }
833     break;
834     case 'u':
835     result = 0;
836     for (count = 0; count < 4; count++) {
837     p++;
838     if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
839     break;
840     }
841     n = *p - '0';
842     if (n > 9) {
843     n = n + '0' + 10 - 'A';
844     }
845     if (n > 16) {
846     n = n + 'A' - 'a';
847     }
848     result = (result << 4) + n;
849     }
850     if (count == 0) {
851     result = 'u';
852     }
853     count += 2;
854     break;
855    
856     case '\n':
857     do {
858     p++;
859     } while ((*p == ' ') || (*p == '\t'));
860     result = ' ';
861     count = p - src;
862     break;
863     case 0:
864     result = '\\';
865     count = 1;
866     break;
867     default:
868     /*
869     * Check for an octal number \oo?o?
870     */
871     if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
872     result = (unsigned char)(*p - '0');
873     p++;
874     if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
875     break;
876     }
877     count = 3;
878     result = (unsigned char)((result << 3) + (*p - '0'));
879     p++;
880     if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
881     break;
882     }
883     count = 4;
884     result = (unsigned char)((result << 3) + (*p - '0'));
885     break;
886     }
887     result = *p;
888     count = 2;
889     break;
890     }
891    
892     if (readPtr != NULL) {
893     *readPtr = count;
894     }
895     return Tcl_UniCharToUtf(result, dst);
896     }
897    
898     /*
899     *----------------------------------------------------------------------
900     *
901     * Tcl_UtfToUpper --
902     *
903     * Convert lowercase characters to uppercase characters in a UTF
904     * string in place. The conversion may shrink the UTF string.
905     *
906     * Results:
907     * Returns the number of bytes in the resulting string
908     * excluding the trailing null.
909     *
910     * Side effects:
911     * Writes a terminating null after the last converted character.
912     *
913     *----------------------------------------------------------------------
914     */
915    
916     int
917     Tcl_UtfToUpper(str)
918     char *str; /* String to convert in place. */
919     {
920     Tcl_UniChar ch, upChar;
921     char *src, *dst;
922     int bytes;
923    
924     /*
925     * Iterate over the string until we hit the terminating null.
926     */
927    
928     src = dst = str;
929     while (*src) {
930     bytes = Tcl_UtfToUniChar(src, &ch);
931     upChar = Tcl_UniCharToUpper(ch);
932    
933     /*
934     * To keep badly formed Utf strings from getting inflated by
935     * the conversion (thereby causing a segfault), only copy the
936     * upper case char to dst if its size is <= the original char.
937     */
938    
939     if (bytes < UtfCount(upChar)) {
940     memcpy(dst, src, (size_t) bytes);
941     dst += bytes;
942     } else {
943     dst += Tcl_UniCharToUtf(upChar, dst);
944     }
945     src += bytes;
946     }
947     *dst = '\0';
948     return (dst - str);
949     }
950    
951     /*
952     *----------------------------------------------------------------------
953     *
954     * Tcl_UtfToLower --
955     *
956     * Convert uppercase characters to lowercase characters in a UTF
957     * string in place. The conversion may shrink the UTF string.
958     *
959     * Results:
960     * Returns the number of bytes in the resulting string
961     * excluding the trailing null.
962     *
963     * Side effects:
964     * Writes a terminating null after the last converted character.
965     *
966     *----------------------------------------------------------------------
967     */
968    
969     int
970     Tcl_UtfToLower(str)
971     char *str; /* String to convert in place. */
972     {
973     Tcl_UniChar ch, lowChar;
974     char *src, *dst;
975     int bytes;
976    
977     /*
978     * Iterate over the string until we hit the terminating null.
979     */
980    
981     src = dst = str;
982     while (*src) {
983     bytes = Tcl_UtfToUniChar(src, &ch);
984     lowChar = Tcl_UniCharToLower(ch);
985    
986     /*
987     * To keep badly formed Utf strings from getting inflated by
988     * the conversion (thereby causing a segfault), only copy the
989     * lower case char to dst if its size is <= the original char.
990     */
991    
992     if (bytes < UtfCount(lowChar)) {
993     memcpy(dst, src, (size_t) bytes);
994     dst += bytes;
995     } else {
996     dst += Tcl_UniCharToUtf(lowChar, dst);
997     }
998     src += bytes;
999     }
1000     *dst = '\0';
1001     return (dst - str);
1002     }
1003    
1004     /*
1005     *----------------------------------------------------------------------
1006     *
1007     * Tcl_UtfToTitle --
1008     *
1009     * Changes the first character of a UTF string to title case or
1010     * uppercase and the rest of the string to lowercase. The
1011     * conversion happens in place and may shrink the UTF string.
1012     *
1013     * Results:
1014     * Returns the number of bytes in the resulting string
1015     * excluding the trailing null.
1016     *
1017     * Side effects:
1018     * Writes a terminating null after the last converted character.
1019     *
1020     *----------------------------------------------------------------------
1021     */
1022    
1023     int
1024     Tcl_UtfToTitle(str)
1025     char *str; /* String to convert in place. */
1026     {
1027     Tcl_UniChar ch, titleChar, lowChar;
1028     char *src, *dst;
1029     int bytes;
1030    
1031     /*
1032     * Capitalize the first character and then lowercase the rest of the
1033     * characters until we get to a null.
1034     */
1035    
1036     src = dst = str;
1037    
1038     if (*src) {
1039     bytes = Tcl_UtfToUniChar(src, &ch);
1040     titleChar = Tcl_UniCharToTitle(ch);
1041    
1042     if (bytes < UtfCount(titleChar)) {
1043     memcpy(dst, src, (size_t) bytes);
1044     dst += bytes;
1045     } else {
1046     dst += Tcl_UniCharToUtf(titleChar, dst);
1047     }
1048     src += bytes;
1049     }
1050     while (*src) {
1051     bytes = Tcl_UtfToUniChar(src, &ch);
1052     lowChar = Tcl_UniCharToLower(ch);
1053    
1054     if (bytes < UtfCount(lowChar)) {
1055     memcpy(dst, src, (size_t) bytes);
1056     dst += bytes;
1057     } else {
1058     dst += Tcl_UniCharToUtf(lowChar, dst);
1059     }
1060     src += bytes;
1061     }
1062     *dst = '\0';
1063     return (dst - str);
1064     }
1065    
1066     /*
1067     *----------------------------------------------------------------------
1068     *
1069     * Tcl_UtfNcmp --
1070     *
1071     * Compare at most n UTF chars of string cs to string ct. Both cs
1072     * and ct are assumed to be at least n UTF chars long.
1073     *
1074     * Results:
1075     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1076     *
1077     * Side effects:
1078     * None.
1079     *
1080     *----------------------------------------------------------------------
1081     */
1082    
1083     int
1084     Tcl_UtfNcmp(cs, ct, n)
1085     CONST char *cs; /* UTF string to compare to ct. */
1086     CONST char *ct; /* UTF string cs is compared to. */
1087     unsigned long n; /* Number of UTF chars to compare. */
1088     {
1089     Tcl_UniChar ch1, ch2;
1090     /*
1091     * Another approach that should work is:
1092     * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1093     * That assumes that ct is a properly formed UTF, so we will just
1094     * be comparing the bytes that compromise those strings to the
1095     * char length n.
1096     */
1097     while (n-- > 0) {
1098     /*
1099     * n must be interpreted as chars, not bytes.
1100     * This should be called only when both strings are of
1101     * at least n chars long (no need for \0 check)
1102     */
1103     cs += Tcl_UtfToUniChar(cs, &ch1);
1104     ct += Tcl_UtfToUniChar(ct, &ch2);
1105     if (ch1 != ch2) {
1106     return (ch1 - ch2);
1107     }
1108     }
1109     return 0;
1110     }
1111    
1112     /*
1113     *----------------------------------------------------------------------
1114     *
1115     * Tcl_UtfNcasecmp --
1116     *
1117     * Compare at most n UTF chars of string cs to string ct case
1118     * insensitive. Both cs and ct are assumed to be at least n
1119     * UTF chars long.
1120     *
1121     * Results:
1122     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1123     *
1124     * Side effects:
1125     * None.
1126     *
1127     *----------------------------------------------------------------------
1128     */
1129    
1130     int
1131     Tcl_UtfNcasecmp(cs, ct, n)
1132     CONST char *cs; /* UTF string to compare to ct. */
1133     CONST char *ct; /* UTF string cs is compared to. */
1134     unsigned long n; /* Number of UTF chars to compare. */
1135     {
1136     Tcl_UniChar ch1, ch2;
1137     while (n-- > 0) {
1138     /*
1139     * n must be interpreted as chars, not bytes.
1140     * This should be called only when both strings are of
1141     * at least n chars long (no need for \0 check)
1142     */
1143     cs += Tcl_UtfToUniChar(cs, &ch1);
1144     ct += Tcl_UtfToUniChar(ct, &ch2);
1145     if (ch1 != ch2) {
1146     ch1 = Tcl_UniCharToLower(ch1);
1147     ch2 = Tcl_UniCharToLower(ch2);
1148     if (ch1 != ch2) {
1149     return (ch1 - ch2);
1150     }
1151     }
1152     }
1153     return 0;
1154     }
1155    
1156     /*
1157     *----------------------------------------------------------------------
1158     *
1159     * Tcl_UniCharToUpper --
1160     *
1161     * Compute the uppercase equivalent of the given Unicode character.
1162     *
1163     * Results:
1164     * Returns the uppercase Unicode character.
1165     *
1166     * Side effects:
1167     * None.
1168     *
1169     *----------------------------------------------------------------------
1170     */
1171    
1172     Tcl_UniChar
1173     Tcl_UniCharToUpper(ch)
1174     int ch; /* Unicode character to convert. */
1175     {
1176     int info = GetUniCharInfo(ch);
1177    
1178     if (GetCaseType(info) & 0x04) {
1179     return (Tcl_UniChar) (ch - GetDelta(info));
1180     } else {
1181     return ch;
1182     }
1183     }
1184    
1185     /*
1186     *----------------------------------------------------------------------
1187     *
1188     * Tcl_UniCharToLower --
1189     *
1190     * Compute the lowercase equivalent of the given Unicode character.
1191     *
1192     * Results:
1193     * Returns the lowercase Unicode character.
1194     *
1195     * Side effects:
1196     * None.
1197     *
1198     *----------------------------------------------------------------------
1199     */
1200    
1201     Tcl_UniChar
1202     Tcl_UniCharToLower(ch)
1203     int ch; /* Unicode character to convert. */
1204     {
1205     int info = GetUniCharInfo(ch);
1206    
1207     if (GetCaseType(info) & 0x02) {
1208     return (Tcl_UniChar) (ch + GetDelta(info));
1209     } else {
1210     return ch;
1211     }
1212     }
1213    
1214     /*
1215     *----------------------------------------------------------------------
1216     *
1217     * Tcl_UniCharToTitle --
1218     *
1219     * Compute the titlecase equivalent of the given Unicode character.
1220     *
1221     * Results:
1222     * Returns the titlecase Unicode character.
1223     *
1224     * Side effects:
1225     * None.
1226     *
1227     *----------------------------------------------------------------------
1228     */
1229    
1230     Tcl_UniChar
1231     Tcl_UniCharToTitle(ch)
1232     int ch; /* Unicode character to convert. */
1233     {
1234     int info = GetUniCharInfo(ch);
1235     int mode = GetCaseType(info);
1236    
1237     if (mode & 0x1) {
1238     /*
1239     * Subtract or add one depending on the original case.
1240     */
1241    
1242     return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1243     } else if (mode == 0x4) {
1244     return (Tcl_UniChar) (ch - GetDelta(info));
1245     } else {
1246     return ch;
1247     }
1248     }
1249    
1250     /*
1251     *----------------------------------------------------------------------
1252     *
1253     * Tcl_UniCharLen --
1254     *
1255     * Find the length of a UniChar string. The str input must be null
1256     * terminated.
1257     *
1258     * Results:
1259     * Returns the length of str in UniChars (not bytes).
1260     *
1261     * Side effects:
1262     * None.
1263     *
1264     *----------------------------------------------------------------------
1265     */
1266    
1267     int
1268     Tcl_UniCharLen(str)
1269     Tcl_UniChar *str; /* Unicode string to find length of. */
1270     {
1271     int len = 0;
1272    
1273     while (*str != '\0') {
1274     len++;
1275     str++;
1276     }
1277     return len;
1278     }
1279    
1280     /*
1281     *----------------------------------------------------------------------
1282     *
1283     * Tcl_UniCharNcmp --
1284     *
1285     * Compare at most n unichars of string cs to string ct. Both cs
1286     * and ct are assumed to be at least n unichars long.
1287     *
1288     * Results:
1289     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1290     *
1291     * Side effects:
1292     * None.
1293     *
1294     *----------------------------------------------------------------------
1295     */
1296    
1297     int
1298     Tcl_UniCharNcmp(cs, ct, n)
1299     CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1300     CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1301     unsigned long n; /* Number of unichars to compare. */
1302     {
1303     for ( ; n != 0; n--, cs++, ct++) {
1304     if (*cs != *ct) {
1305     return *cs - *ct;
1306     }
1307     if (*cs == '\0') {
1308     break;
1309     }
1310     }
1311     return 0;
1312     }
1313    
1314     /*
1315     *----------------------------------------------------------------------
1316     *
1317     * Tcl_UniCharIsAlnum --
1318     *
1319     * Test if a character is an alphanumeric Unicode character.
1320     *
1321     * Results:
1322     * Returns 1 if character is alphanumeric.
1323     *
1324     * Side effects:
1325     * None.
1326     *
1327     *----------------------------------------------------------------------
1328     */
1329    
1330     int
1331     Tcl_UniCharIsAlnum(ch)
1332     int ch; /* Unicode character to test. */
1333     {
1334     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1335    
1336     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1337     }
1338    
1339     /*
1340     *----------------------------------------------------------------------
1341     *
1342     * Tcl_UniCharIsAlpha --
1343     *
1344     * Test if a character is an alphabetic Unicode character.
1345     *
1346     * Results:
1347     * Returns 1 if character is alphabetic.
1348     *
1349     * Side effects:
1350     * None.
1351     *
1352     *----------------------------------------------------------------------
1353     */
1354    
1355     int
1356     Tcl_UniCharIsAlpha(ch)
1357     int ch; /* Unicode character to test. */
1358     {
1359     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1360     return ((ALPHA_BITS >> category) & 1);
1361     }
1362    
1363     /*
1364     *----------------------------------------------------------------------
1365     *
1366     * Tcl_UniCharIsControl --
1367     *
1368     * Test if a character is a Unicode control character.
1369     *
1370     * Results:
1371     * Returns non-zero if character is a control.
1372     *
1373     * Side effects:
1374     * None.
1375     *
1376     *----------------------------------------------------------------------
1377     */
1378    
1379     int
1380     Tcl_UniCharIsControl(ch)
1381     int ch; /* Unicode character to test. */
1382     {
1383     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1384     }
1385    
1386     /*
1387     *----------------------------------------------------------------------
1388     *
1389     * Tcl_UniCharIsDigit --
1390     *
1391     * Test if a character is a numeric Unicode character.
1392     *
1393     * Results:
1394     * Returns non-zero if character is a digit.
1395     *
1396     * Side effects:
1397     * None.
1398     *
1399     *----------------------------------------------------------------------
1400     */
1401    
1402     int
1403     Tcl_UniCharIsDigit(ch)
1404     int ch; /* Unicode character to test. */
1405     {
1406     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1407     == DECIMAL_DIGIT_NUMBER);
1408     }
1409    
1410     /*
1411     *----------------------------------------------------------------------
1412     *
1413     * Tcl_UniCharIsGraph --
1414     *
1415     * Test if a character is any Unicode print character except space.
1416     *
1417     * Results:
1418     * Returns non-zero if character is printable, but not space.
1419     *
1420     * Side effects:
1421     * None.
1422     *
1423     *----------------------------------------------------------------------
1424     */
1425    
1426     int
1427     Tcl_UniCharIsGraph(ch)
1428     int ch; /* Unicode character to test. */
1429     {
1430     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1431     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1432     }
1433    
1434     /*
1435     *----------------------------------------------------------------------
1436     *
1437     * Tcl_UniCharIsLower --
1438     *
1439     * Test if a character is a lowercase Unicode character.
1440     *
1441     * Results:
1442     * Returns non-zero if character is lowercase.
1443     *
1444     * Side effects:
1445     * None.
1446     *
1447     *----------------------------------------------------------------------
1448     */
1449    
1450     int
1451     Tcl_UniCharIsLower(ch)
1452     int ch; /* Unicode character to test. */
1453     {
1454     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1455     }
1456    
1457     /*
1458     *----------------------------------------------------------------------
1459     *
1460     * Tcl_UniCharIsPrint --
1461     *
1462     * Test if a character is a Unicode print character.
1463     *
1464     * Results:
1465     * Returns non-zero if character is printable.
1466     *
1467     * Side effects:
1468     * None.
1469     *
1470     *----------------------------------------------------------------------
1471     */
1472    
1473     int
1474     Tcl_UniCharIsPrint(ch)
1475     int ch; /* Unicode character to test. */
1476     {
1477     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1478     return ((PRINT_BITS >> category) & 1);
1479     }
1480    
1481     /*
1482     *----------------------------------------------------------------------
1483     *
1484     * Tcl_UniCharIsPunct --
1485     *
1486     * Test if a character is a Unicode punctuation character.
1487     *
1488     * Results:
1489     * Returns non-zero if character is punct.
1490     *
1491     * Side effects:
1492     * None.
1493     *
1494     *----------------------------------------------------------------------
1495     */
1496    
1497     int
1498     Tcl_UniCharIsPunct(ch)
1499     int ch; /* Unicode character to test. */
1500     {
1501     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1502     return ((PUNCT_BITS >> category) & 1);
1503     }
1504    
1505     /*
1506     *----------------------------------------------------------------------
1507     *
1508     * Tcl_UniCharIsSpace --
1509     *
1510     * Test if a character is a whitespace Unicode character.
1511     *
1512     * Results:
1513     * Returns non-zero if character is a space.
1514     *
1515     * Side effects:
1516     * None.
1517     *
1518     *----------------------------------------------------------------------
1519     */
1520    
1521     int
1522     Tcl_UniCharIsSpace(ch)
1523     int ch; /* Unicode character to test. */
1524     {
1525     register int category;
1526    
1527     /*
1528     * If the character is within the first 127 characters, just use the
1529     * standard C function, otherwise consult the Unicode table.
1530     */
1531    
1532     if (ch < 0x80) {
1533     return isspace(UCHAR(ch)); /* INTL: ISO space */
1534     } else {
1535     category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1536     return ((SPACE_BITS >> category) & 1);
1537     }
1538     }
1539    
1540     /*
1541     *----------------------------------------------------------------------
1542     *
1543     * Tcl_UniCharIsUpper --
1544     *
1545     * Test if a character is a uppercase Unicode character.
1546     *
1547     * Results:
1548     * Returns non-zero if character is uppercase.
1549     *
1550     * Side effects:
1551     * None.
1552     *
1553     *----------------------------------------------------------------------
1554     */
1555    
1556     int
1557     Tcl_UniCharIsUpper(ch)
1558     int ch; /* Unicode character to test. */
1559     {
1560     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1561     }
1562    
1563     /*
1564     *----------------------------------------------------------------------
1565     *
1566     * Tcl_UniCharIsWordChar --
1567     *
1568     * Test if a character is alphanumeric or a connector punctuation
1569     * mark.
1570     *
1571     * Results:
1572     * Returns 1 if character is a word character.
1573     *
1574     * Side effects:
1575     * None.
1576     *
1577     *----------------------------------------------------------------------
1578     */
1579    
1580     int
1581     Tcl_UniCharIsWordChar(ch)
1582     int ch; /* Unicode character to test. */
1583     {
1584     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1585    
1586     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1587     }
1588    
1589 dashley 67 /* End of tclutf.c */

Properties

Name Value
svn:keywords Header

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25