/[dtapublic]/projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c
ViewVC logotype

Annotation of /projs/trunk/shared_source/c_tcl_base_7_5_w_mods/tclutf.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 44 - (hide annotations) (download)
Fri Oct 14 02:09:58 2016 UTC (8 years ago) by dashley
File MIME type: text/plain
File size: 39815 byte(s)
Rename for reorganization.
1 dashley 25 /* $Header: /cvsroot/esrg/sfesrg/esrgpcpj/shared/tcl_base/tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $ */
2    
3     /*
4     * tclUtf.c --
5     *
6     * Routines for manipulating UTF-8 strings.
7     *
8     * Copyright (c) 1997-1998 Sun Microsystems, Inc.
9     *
10     * See the file "license.terms" for information on usage and redistribution
11     * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
12     *
13     * RCS: @(#) $Id: tclutf.c,v 1.1.1.1 2001/06/13 04:47:01 dtashley Exp $
14     */
15    
16     #include "tclInt.h"
17    
18     /*
19     * Include the static character classification tables and macros.
20     */
21    
22     #include "tclUniData.c"
23    
/*
 * Bit masks used for fast character category tests.  Each x_BITS mask has
 * one bit set for every category constant that belongs to the class; a
 * mask is shifted right by a character's category value to find out
 * whether that category is a member of the class.
 */

/* Letters of every kind. */
#define ALPHA_BITS \
    ((1 << UPPERCASE_LETTER) | \
     (1 << LOWERCASE_LETTER) | \
     (1 << TITLECASE_LETTER) | \
     (1 << MODIFIER_LETTER)  | \
     (1 << OTHER_LETTER))

/* Decimal digits only. */
#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* Space, line and paragraph separators. */
#define SPACE_BITS \
    ((1 << SPACE_SEPARATOR) | \
     (1 << LINE_SEPARATOR)  | \
     (1 << PARAGRAPH_SEPARATOR))

/* Connector punctuation on its own. */
#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/* All seven punctuation categories. */
#define PUNCT_BITS \
    ((1 << CONNECTOR_PUNCTUATION)     | \
     (1 << DASH_PUNCTUATION)          | \
     (1 << OPEN_PUNCTUATION)          | \
     (1 << CLOSE_PUNCTUATION)         | \
     (1 << INITIAL_QUOTE_PUNCTUATION) | \
     (1 << FINAL_QUOTE_PUNCTUATION)   | \
     (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, marks, the remaining
 * number categories, all punctuation and all symbols.
 */
#define PRINT_BITS \
    (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
     (1 << NON_SPACING_MARK)       | \
     (1 << ENCLOSING_MARK)         | \
     (1 << COMBINING_SPACING_MARK) | \
     (1 << LETTER_NUMBER)          | \
     (1 << OTHER_NUMBER)           | \
     (1 << MATH_SYMBOL)            | \
     (1 << CURRENCY_SYMBOL)        | \
     (1 << MODIFIER_SYMBOL)        | \
     (1 << OTHER_SYMBOL))

/*
 * Unicode characters less than this value are represented by themselves
 * (a single byte) in UTF-8 strings.
 */

#define UNICODE_SELF 0x80
61    
62     /*
63     * The following structures are used when mapping between Unicode (UCS-2)
64     * and UTF-8.
65     */
66    
67     CONST unsigned char totalBytes[256] = {
68     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
73     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
74     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
75     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
76     #if TCL_UTF_MAX > 3
77     4,4,4,4,4,4,4,4,
78     #else
79     1,1,1,1,1,1,1,1,
80     #endif
81     #if TCL_UTF_MAX > 4
82     5,5,5,5,
83     #else
84     1,1,1,1,
85     #endif
86     #if TCL_UTF_MAX > 5
87     6,6,6,6
88     #else
89     1,1,1,1
90     #endif
91     };
92    
93     /*
94     * Procedures used only in this module.
95     */
96    
97     static int UtfCount _ANSI_ARGS_((int ch));
98    
99    
100     /*
101     *---------------------------------------------------------------------------
102     *
103     * UtfCount --
104     *
105     * Find the number of bytes in the Utf character "ch".
106     *
107     * Results:
108     * The return values is the number of bytes in the Utf character "ch".
109     *
110     * Side effects:
111     * None.
112     *
113     *---------------------------------------------------------------------------
114     */
115    
116     static int
117     UtfCount(ch)
118     int ch; /* The Tcl_UniChar whose size is returned. */
119     {
120     if ((ch > 0) && (ch < UNICODE_SELF)) {
121     return 1;
122     }
123     if (ch <= 0x7FF) {
124     return 2;
125     }
126     if (ch <= 0xFFFF) {
127     return 3;
128     }
129     #if TCL_UTF_MAX > 3
130     if (ch <= 0x1FFFFF) {
131     return 4;
132     }
133     if (ch <= 0x3FFFFFF) {
134     return 5;
135     }
136     if (ch <= 0x7FFFFFFF) {
137     return 6;
138     }
139     #endif
140     return 3;
141     }
142    
143     /*
144     *---------------------------------------------------------------------------
145     *
146     * Tcl_UniCharToUtf --
147     *
148     * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
149     * provided buffer. Equivalent to Plan 9 runetochar().
150     *
151     * Results:
152     * The return values is the number of bytes in the buffer that
153     * were consumed.
154     *
155     * Side effects:
156     * None.
157     *
158     *---------------------------------------------------------------------------
159     */
160    
161     INLINE int
162     Tcl_UniCharToUtf(ch, str)
163     int ch; /* The Tcl_UniChar to be stored in the
164     * buffer. */
165     char *str; /* Buffer in which the UTF-8 representation
166     * of the Tcl_UniChar is stored. Buffer must
167     * be large enough to hold the UTF-8 character
168     * (at most TCL_UTF_MAX bytes). */
169     {
170     if ((ch > 0) && (ch < UNICODE_SELF)) {
171     str[0] = (char) ch;
172     return 1;
173     }
174     if (ch <= 0x7FF) {
175     str[1] = (char) ((ch | 0x80) & 0xBF);
176     str[0] = (char) ((ch >> 6) | 0xC0);
177     return 2;
178     }
179     if (ch <= 0xFFFF) {
180     three:
181     str[2] = (char) ((ch | 0x80) & 0xBF);
182     str[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
183     str[0] = (char) ((ch >> 12) | 0xE0);
184     return 3;
185     }
186    
187     #if TCL_UTF_MAX > 3
188     if (ch <= 0x1FFFFF) {
189     str[3] = (char) ((ch | 0x80) & 0xBF);
190     str[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
191     str[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
192     str[0] = (char) ((ch >> 18) | 0xF0);
193     return 4;
194     }
195     if (ch <= 0x3FFFFFF) {
196     str[4] = (char) ((ch | 0x80) & 0xBF);
197     str[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
198     str[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
199     str[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
200     str[0] = (char) ((ch >> 24) | 0xF8);
201     return 5;
202     }
203     if (ch <= 0x7FFFFFFF) {
204     str[5] = (char) ((ch | 0x80) & 0xBF);
205     str[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
206     str[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
207     str[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
208     str[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
209     str[0] = (char) ((ch >> 30) | 0xFC);
210     return 6;
211     }
212     #endif
213    
214     ch = 0xFFFD;
215     goto three;
216     }
217    
218     /*
219     *---------------------------------------------------------------------------
220     *
221     * Tcl_UniCharToUtfDString --
222     *
223     * Convert the given Unicode string to UTF-8.
224     *
225     * Results:
226     * The return value is a pointer to the UTF-8 representation of the
227     * Unicode string. Storage for the return value is appended to the
228     * end of dsPtr.
229     *
230     * Side effects:
231     * None.
232     *
233     *---------------------------------------------------------------------------
234     */
235    
236     char *
237     Tcl_UniCharToUtfDString(wString, numChars, dsPtr)
238     CONST Tcl_UniChar *wString; /* Unicode string to convert to UTF-8. */
239     int numChars; /* Length of Unicode string in Tcl_UniChars
240     * (must be >= 0). */
241     Tcl_DString *dsPtr; /* UTF-8 representation of string is
242     * appended to this previously initialized
243     * DString. */
244     {
245     CONST Tcl_UniChar *w, *wEnd;
246     char *p, *string;
247     int oldLength;
248    
249     /*
250     * UTF-8 string length in bytes will be <= Unicode string length *
251     * TCL_UTF_MAX.
252     */
253    
254     oldLength = Tcl_DStringLength(dsPtr);
255     Tcl_DStringSetLength(dsPtr, (oldLength + numChars + 1) * TCL_UTF_MAX);
256     string = Tcl_DStringValue(dsPtr) + oldLength;
257    
258     p = string;
259     wEnd = wString + numChars;
260     for (w = wString; w < wEnd; ) {
261     p += Tcl_UniCharToUtf(*w, p);
262     w++;
263     }
264     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
265    
266     return string;
267     }
268    
269     /*
270     *---------------------------------------------------------------------------
271     *
272     * Tcl_UtfToUniChar --
273     *
274     * Extract the Tcl_UniChar represented by the UTF-8 string. Bad
275     * UTF-8 sequences are converted to valid Tcl_UniChars and processing
276     * continues. Equivalent to Plan 9 chartorune().
277     *
278     * The caller must ensure that the source buffer is long enough that
279     * this routine does not run off the end and dereference non-existent
280     * memory looking for trail bytes. If the source buffer is known to
281     * be '\0' terminated, this cannot happen. Otherwise, the caller
282     * should call Tcl_UtfCharComplete() before calling this routine to
283     * ensure that enough bytes remain in the string.
284     *
285     * Results:
286     * *chPtr is filled with the Tcl_UniChar, and the return value is the
287     * number of bytes from the UTF-8 string that were consumed.
288     *
289     * Side effects:
290     * None.
291     *
292     *---------------------------------------------------------------------------
293     */
294    
295     int
296     Tcl_UtfToUniChar(str, chPtr)
297     register CONST char *str; /* The UTF-8 string. */
298     register Tcl_UniChar *chPtr; /* Filled with the Tcl_UniChar represented
299     * by the UTF-8 string. */
300     {
301     register int byte;
302    
303     /*
304     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
305     */
306    
307     byte = *((unsigned char *) str);
308     if (byte < 0xC0) {
309     /*
310     * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
311     * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
312     * characters representing themselves.
313     */
314    
315     *chPtr = (Tcl_UniChar) byte;
316     return 1;
317     } else if (byte < 0xE0) {
318     if ((str[1] & 0xC0) == 0x80) {
319     /*
320     * Two-byte-character lead-byte followed by a trail-byte.
321     */
322    
323     *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (str[1] & 0x3F));
324     return 2;
325     }
326     /*
327     * A two-byte-character lead-byte not followed by trail-byte
328     * represents itself.
329     */
330    
331     *chPtr = (Tcl_UniChar) byte;
332     return 1;
333     } else if (byte < 0xF0) {
334     if (((str[1] & 0xC0) == 0x80) && ((str[2] & 0xC0) == 0x80)) {
335     /*
336     * Three-byte-character lead byte followed by two trail bytes.
337     */
338    
339     *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
340     | ((str[1] & 0x3F) << 6) | (str[2] & 0x3F));
341     return 3;
342     }
343     /*
344     * A three-byte-character lead-byte not followed by two trail-bytes
345     * represents itself.
346     */
347    
348     *chPtr = (Tcl_UniChar) byte;
349     return 1;
350     }
351     #if TCL_UTF_MAX > 3
352     else {
353     int ch, total, trail;
354    
355     total = totalBytes[byte];
356     trail = total - 1;
357     if (trail > 0) {
358     ch = byte & (0x3F >> trail);
359     do {
360     str++;
361     if ((*str & 0xC0) != 0x80) {
362     *chPtr = byte;
363     return 1;
364     }
365     ch <<= 6;
366     ch |= (*str & 0x3F);
367     trail--;
368     } while (trail > 0);
369     *chPtr = ch;
370     return total;
371     }
372     }
373     #endif
374    
375     *chPtr = (Tcl_UniChar) byte;
376     return 1;
377     }
378    
379     /*
380     *---------------------------------------------------------------------------
381     *
382     * Tcl_UtfToUniCharDString --
383     *
384     * Convert the UTF-8 string to Unicode.
385     *
386     * Results:
387     * The return value is a pointer to the Unicode representation of the
388     * UTF-8 string. Storage for the return value is appended to the
389     * end of dsPtr. The Unicode string is terminated with a Unicode
390     * NULL character.
391     *
392     * Side effects:
393     * None.
394     *
395     *---------------------------------------------------------------------------
396     */
397    
398     Tcl_UniChar *
399     Tcl_UtfToUniCharDString(string, length, dsPtr)
400     CONST char *string; /* UTF-8 string to convert to Unicode. */
401     int length; /* Length of UTF-8 string in bytes, or -1
402     * for strlen(). */
403     Tcl_DString *dsPtr; /* Unicode representation of string is
404     * appended to this previously initialized
405     * DString. */
406     {
407     Tcl_UniChar *w, *wString;
408     CONST char *p, *end;
409     int oldLength;
410    
411     if (length < 0) {
412     length = strlen(string);
413     }
414    
415     /*
416     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length
417     * in bytes.
418     */
419    
420     oldLength = Tcl_DStringLength(dsPtr);
421     Tcl_DStringSetLength(dsPtr,
422     (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
423     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
424    
425     w = wString;
426     end = string + length;
427     for (p = string; p < end; ) {
428     p += Tcl_UtfToUniChar(p, w);
429     w++;
430     }
431     *w = '\0';
432     Tcl_DStringSetLength(dsPtr,
433     (oldLength + ((char *) w - (char *) wString)));
434    
435     return wString;
436     }
437    
438     /*
439     *---------------------------------------------------------------------------
440     *
441     * Tcl_UtfCharComplete --
442     *
443     * Determine if the UTF-8 string of the given length is long enough
444     * to be decoded by Tcl_UtfToUniChar(). This does not ensure that the
445     * UTF-8 string is properly formed. Equivalent to Plan 9 fullrune().
446     *
447     * Results:
448     * The return value is 0 if the string is not long enough, non-zero
449     * otherwise.
450     *
451     * Side effects:
452     * None.
453     *
454     *---------------------------------------------------------------------------
455     */
456    
457     int
458     Tcl_UtfCharComplete(str, len)
459     CONST char *str; /* String to check if first few bytes
460     * contain a complete UTF-8 character. */
461     int len; /* Length of above string in bytes. */
462     {
463     int ch;
464    
465     ch = *((unsigned char *) str);
466     return len >= totalBytes[ch];
467     }
468    
469     /*
470     *---------------------------------------------------------------------------
471     *
472     * Tcl_NumUtfChars --
473     *
474     * Returns the number of characters (not bytes) in the UTF-8 string,
475     * not including the terminating NULL byte. This is equivalent to
476     * Plan 9 utflen() and utfnlen().
477     *
478     * Results:
479     * As above.
480     *
481     * Side effects:
482     * None.
483     *
484     *---------------------------------------------------------------------------
485     */
486    
487     int
488     Tcl_NumUtfChars(str, len)
489     register CONST char *str; /* The UTF-8 string to measure. */
490     int len; /* The length of the string in bytes, or -1
491     * for strlen(string). */
492     {
493     Tcl_UniChar ch;
494     register Tcl_UniChar *chPtr = &ch;
495     register int n;
496     int i;
497    
498     /*
499     * The separate implementations are faster.
500     */
501    
502     i = 0;
503     if (len < 0) {
504     while (1) {
505     str += Tcl_UtfToUniChar(str, chPtr);
506     if (ch == '\0') {
507     break;
508     }
509     i++;
510     }
511     } else {
512     while (len > 0) {
513     n = Tcl_UtfToUniChar(str, chPtr);
514     len -= n;
515     str += n;
516     i++;
517     }
518     }
519     return i;
520     }
521    
522     /*
523     *---------------------------------------------------------------------------
524     *
525     * Tcl_UtfFindFirst --
526     *
527     * Returns a pointer to the first occurance of the given Tcl_UniChar
528     * in the NULL-terminated UTF-8 string. The NULL terminator is
529     * considered part of the UTF-8 string. Equivalent to Plan 9
530     * utfrune().
531     *
532     * Results:
533     * As above. If the Tcl_UniChar does not exist in the given string,
534     * the return value is NULL.
535     *
536     * Side effects:
537     * None.
538     *
539     *---------------------------------------------------------------------------
540     */
541     char *
542     Tcl_UtfFindFirst(string, ch)
543     CONST char *string; /* The UTF-8 string to be searched. */
544     int ch; /* The Tcl_UniChar to search for. */
545     {
546     int len;
547     Tcl_UniChar find;
548    
549     while (1) {
550     len = Tcl_UtfToUniChar(string, &find);
551     if (find == ch) {
552     return (char *) string;
553     }
554     if (*string == '\0') {
555     return NULL;
556     }
557     string += len;
558     }
559     }
560    
561     /*
562     *---------------------------------------------------------------------------
563     *
564     * Tcl_UtfFindLast --
565     *
566     * Returns a pointer to the last occurance of the given Tcl_UniChar
567     * in the NULL-terminated UTF-8 string. The NULL terminator is
568     * considered part of the UTF-8 string. Equivalent to Plan 9
569     * utfrrune().
570     *
571     * Results:
572     * As above. If the Tcl_UniChar does not exist in the given string,
573     * the return value is NULL.
574     *
575     * Side effects:
576     * None.
577     *
578     *---------------------------------------------------------------------------
579     */
580    
581     char *
582     Tcl_UtfFindLast(string, ch)
583     CONST char *string; /* The UTF-8 string to be searched. */
584     int ch; /* The Tcl_UniChar to search for. */
585     {
586     int len;
587     Tcl_UniChar find;
588     CONST char *last;
589    
590     last = NULL;
591     while (1) {
592     len = Tcl_UtfToUniChar(string, &find);
593     if (find == ch) {
594     last = string;
595     }
596     if (*string == '\0') {
597     break;
598     }
599     string += len;
600     }
601     return (char *) last;
602     }
603    
604     /*
605     *---------------------------------------------------------------------------
606     *
607     * Tcl_UtfNext --
608     *
609     * Given a pointer to some current location in a UTF-8 string,
610     * move forward one character. The caller must ensure that they
611     * are not asking for the next character after the last character
612     * in the string.
613     *
614     * Results:
615     * The return value is the pointer to the next character in
616     * the UTF-8 string.
617     *
618     * Side effects:
619     * None.
620     *
621     *---------------------------------------------------------------------------
622     */
623    
624     char *
625     Tcl_UtfNext(str)
626     CONST char *str; /* The current location in the string. */
627     {
628     Tcl_UniChar ch;
629    
630     return (char *) str + Tcl_UtfToUniChar(str, &ch);
631     }
632    
633     /*
634     *---------------------------------------------------------------------------
635     *
636     * Tcl_UtfPrev --
637     *
638     * Given a pointer to some current location in a UTF-8 string,
639     * move backwards one character.
640     *
641     * Results:
642     * The return value is a pointer to the previous character in the
643     * UTF-8 string. If the current location was already at the
644     * beginning of the string, the return value will also be a
645     * pointer to the beginning of the string.
646     *
647     * Side effects:
648     * None.
649     *
650     *---------------------------------------------------------------------------
651     */
652    
653     char *
654     Tcl_UtfPrev(str, start)
655     CONST char *str; /* The current location in the string. */
656     CONST char *start; /* Pointer to the beginning of the
657     * string, to avoid going backwards too
658     * far. */
659     {
660     CONST char *look;
661     int i, byte;
662    
663     str--;
664     look = str;
665     for (i = 0; i < TCL_UTF_MAX; i++) {
666     if (look < start) {
667     if (str < start) {
668     str = start;
669     }
670     break;
671     }
672     byte = *((unsigned char *) look);
673     if (byte < 0x80) {
674     break;
675     }
676     if (byte >= 0xC0) {
677     if (totalBytes[byte] != i + 1) {
678     break;
679     }
680     return (char *) look;
681     }
682     look--;
683     }
684     return (char *) str;
685     }
686    
687     /*
688     *---------------------------------------------------------------------------
689     *
690     * Tcl_UniCharAtIndex --
691     *
692     * Returns the Unicode character represented at the specified
693     * character (not byte) position in the UTF-8 string.
694     *
695     * Results:
696     * As above.
697     *
698     * Side effects:
699     * None.
700     *
701     *---------------------------------------------------------------------------
702     */
703    
704     Tcl_UniChar
705     Tcl_UniCharAtIndex(src, index)
706     register CONST char *src; /* The UTF-8 string to dereference. */
707     register int index; /* The position of the desired character. */
708     {
709     Tcl_UniChar ch;
710    
711     while (index >= 0) {
712     index--;
713     src += Tcl_UtfToUniChar(src, &ch);
714     }
715     return ch;
716     }
717    
718     /*
719     *---------------------------------------------------------------------------
720     *
721     * Tcl_UtfAtIndex --
722     *
723     * Returns a pointer to the specified character (not byte) position
724     * in the UTF-8 string.
725     *
726     * Results:
727     * As above.
728     *
729     * Side effects:
730     * None.
731     *
732     *---------------------------------------------------------------------------
733     */
734    
735     char *
736     Tcl_UtfAtIndex(src, index)
737     register CONST char *src; /* The UTF-8 string. */
738     register int index; /* The position of the desired character. */
739     {
740     Tcl_UniChar ch;
741    
742     while (index > 0) {
743     index--;
744     src += Tcl_UtfToUniChar(src, &ch);
745     }
746     return (char *) src;
747     }
748    
749     /*
750     *---------------------------------------------------------------------------
751     *
752     * Tcl_UtfBackslash --
753     *
754     * Figure out how to handle a backslash sequence.
755     *
756     * Results:
757     * Stores the bytes represented by the backslash sequence in dst and
758     * returns the number of bytes written to dst. At most TCL_UTF_MAX
759     * bytes are written to dst; dst must have been large enough to accept
760     * those bytes. If readPtr isn't NULL then it is filled in with a
761     * count of the number of bytes in the backslash sequence.
762     *
763     * Side effects:
764     * The maximum number of bytes it takes to represent a Unicode
765     * character in UTF-8 is guaranteed to be less than the number of
766     * bytes used to express the backslash sequence that represents
767     * that Unicode character. If the target buffer into which the
768     * caller is going to store the bytes that represent the Unicode
769     * character is at least as large as the source buffer from which
770     * the backslashed sequence was extracted, no buffer overruns should
771     * occur.
772     *
773     *---------------------------------------------------------------------------
774     */
775    
776     int
777     Tcl_UtfBackslash(src, readPtr, dst)
778     CONST char *src; /* Points to the backslash character of
779     * a backslash sequence. */
780     int *readPtr; /* Fill in with number of characters read
781     * from src, unless NULL. */
782     char *dst; /* Filled with the bytes represented by the
783     * backslash sequence. */
784     {
785     register CONST char *p = src+1;
786     int result, count, n;
787     char buf[TCL_UTF_MAX];
788    
789     if (dst == NULL) {
790     dst = buf;
791     }
792    
793     count = 2;
794     switch (*p) {
795     /*
796     * Note: in the conversions below, use absolute values (e.g.,
797     * 0xa) rather than symbolic values (e.g. \n) that get converted
798     * by the compiler. It's possible that compilers on some
799     * platforms will do the symbolic conversions differently, which
800     * could result in non-portable Tcl scripts.
801     */
802    
803     case 'a':
804     result = 0x7;
805     break;
806     case 'b':
807     result = 0x8;
808     break;
809     case 'f':
810     result = 0xc;
811     break;
812     case 'n':
813     result = 0xa;
814     break;
815     case 'r':
816     result = 0xd;
817     break;
818     case 't':
819     result = 0x9;
820     break;
821     case 'v':
822     result = 0xb;
823     break;
824     case 'x':
825     if (isxdigit(UCHAR(p[1]))) { /* INTL: digit */
826     char *end;
827    
828     result = (unsigned char) strtoul(p+1, &end, 16);
829     count = end - src;
830     } else {
831     count = 2;
832     result = 'x';
833     }
834     break;
835     case 'u':
836     result = 0;
837     for (count = 0; count < 4; count++) {
838     p++;
839     if (!isxdigit(UCHAR(*p))) { /* INTL: digit */
840     break;
841     }
842     n = *p - '0';
843     if (n > 9) {
844     n = n + '0' + 10 - 'A';
845     }
846     if (n > 16) {
847     n = n + 'A' - 'a';
848     }
849     result = (result << 4) + n;
850     }
851     if (count == 0) {
852     result = 'u';
853     }
854     count += 2;
855     break;
856    
857     case '\n':
858     do {
859     p++;
860     } while ((*p == ' ') || (*p == '\t'));
861     result = ' ';
862     count = p - src;
863     break;
864     case 0:
865     result = '\\';
866     count = 1;
867     break;
868     default:
869     /*
870     * Check for an octal number \oo?o?
871     */
872     if (isdigit(UCHAR(*p)) && (UCHAR(*p) < '8')) { /* INTL: digit */
873     result = (unsigned char)(*p - '0');
874     p++;
875     if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
876     break;
877     }
878     count = 3;
879     result = (unsigned char)((result << 3) + (*p - '0'));
880     p++;
881     if (!isdigit(UCHAR(*p)) || (UCHAR(*p) >= '8')) { /* INTL: digit */
882     break;
883     }
884     count = 4;
885     result = (unsigned char)((result << 3) + (*p - '0'));
886     break;
887     }
888     result = *p;
889     count = 2;
890     break;
891     }
892    
893     if (readPtr != NULL) {
894     *readPtr = count;
895     }
896     return Tcl_UniCharToUtf(result, dst);
897     }
898    
899     /*
900     *----------------------------------------------------------------------
901     *
902     * Tcl_UtfToUpper --
903     *
904     * Convert lowercase characters to uppercase characters in a UTF
905     * string in place. The conversion may shrink the UTF string.
906     *
907     * Results:
908     * Returns the number of bytes in the resulting string
909     * excluding the trailing null.
910     *
911     * Side effects:
912     * Writes a terminating null after the last converted character.
913     *
914     *----------------------------------------------------------------------
915     */
916    
917     int
918     Tcl_UtfToUpper(str)
919     char *str; /* String to convert in place. */
920     {
921     Tcl_UniChar ch, upChar;
922     char *src, *dst;
923     int bytes;
924    
925     /*
926     * Iterate over the string until we hit the terminating null.
927     */
928    
929     src = dst = str;
930     while (*src) {
931     bytes = Tcl_UtfToUniChar(src, &ch);
932     upChar = Tcl_UniCharToUpper(ch);
933    
934     /*
935     * To keep badly formed Utf strings from getting inflated by
936     * the conversion (thereby causing a segfault), only copy the
937     * upper case char to dst if its size is <= the original char.
938     */
939    
940     if (bytes < UtfCount(upChar)) {
941     memcpy(dst, src, (size_t) bytes);
942     dst += bytes;
943     } else {
944     dst += Tcl_UniCharToUtf(upChar, dst);
945     }
946     src += bytes;
947     }
948     *dst = '\0';
949     return (dst - str);
950     }
951    
952     /*
953     *----------------------------------------------------------------------
954     *
955     * Tcl_UtfToLower --
956     *
957     * Convert uppercase characters to lowercase characters in a UTF
958     * string in place. The conversion may shrink the UTF string.
959     *
960     * Results:
961     * Returns the number of bytes in the resulting string
962     * excluding the trailing null.
963     *
964     * Side effects:
965     * Writes a terminating null after the last converted character.
966     *
967     *----------------------------------------------------------------------
968     */
969    
970     int
971     Tcl_UtfToLower(str)
972     char *str; /* String to convert in place. */
973     {
974     Tcl_UniChar ch, lowChar;
975     char *src, *dst;
976     int bytes;
977    
978     /*
979     * Iterate over the string until we hit the terminating null.
980     */
981    
982     src = dst = str;
983     while (*src) {
984     bytes = Tcl_UtfToUniChar(src, &ch);
985     lowChar = Tcl_UniCharToLower(ch);
986    
987     /*
988     * To keep badly formed Utf strings from getting inflated by
989     * the conversion (thereby causing a segfault), only copy the
990     * lower case char to dst if its size is <= the original char.
991     */
992    
993     if (bytes < UtfCount(lowChar)) {
994     memcpy(dst, src, (size_t) bytes);
995     dst += bytes;
996     } else {
997     dst += Tcl_UniCharToUtf(lowChar, dst);
998     }
999     src += bytes;
1000     }
1001     *dst = '\0';
1002     return (dst - str);
1003     }
1004    
1005     /*
1006     *----------------------------------------------------------------------
1007     *
1008     * Tcl_UtfToTitle --
1009     *
1010     * Changes the first character of a UTF string to title case or
1011     * uppercase and the rest of the string to lowercase. The
1012     * conversion happens in place and may shrink the UTF string.
1013     *
1014     * Results:
1015     * Returns the number of bytes in the resulting string
1016     * excluding the trailing null.
1017     *
1018     * Side effects:
1019     * Writes a terminating null after the last converted character.
1020     *
1021     *----------------------------------------------------------------------
1022     */
1023    
1024     int
1025     Tcl_UtfToTitle(str)
1026     char *str; /* String to convert in place. */
1027     {
1028     Tcl_UniChar ch, titleChar, lowChar;
1029     char *src, *dst;
1030     int bytes;
1031    
1032     /*
1033     * Capitalize the first character and then lowercase the rest of the
1034     * characters until we get to a null.
1035     */
1036    
1037     src = dst = str;
1038    
1039     if (*src) {
1040     bytes = Tcl_UtfToUniChar(src, &ch);
1041     titleChar = Tcl_UniCharToTitle(ch);
1042    
1043     if (bytes < UtfCount(titleChar)) {
1044     memcpy(dst, src, (size_t) bytes);
1045     dst += bytes;
1046     } else {
1047     dst += Tcl_UniCharToUtf(titleChar, dst);
1048     }
1049     src += bytes;
1050     }
1051     while (*src) {
1052     bytes = Tcl_UtfToUniChar(src, &ch);
1053     lowChar = Tcl_UniCharToLower(ch);
1054    
1055     if (bytes < UtfCount(lowChar)) {
1056     memcpy(dst, src, (size_t) bytes);
1057     dst += bytes;
1058     } else {
1059     dst += Tcl_UniCharToUtf(lowChar, dst);
1060     }
1061     src += bytes;
1062     }
1063     *dst = '\0';
1064     return (dst - str);
1065     }
1066    
1067     /*
1068     *----------------------------------------------------------------------
1069     *
1070     * Tcl_UtfNcmp --
1071     *
1072     * Compare at most n UTF chars of string cs to string ct. Both cs
1073     * and ct are assumed to be at least n UTF chars long.
1074     *
1075     * Results:
1076     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1077     *
1078     * Side effects:
1079     * None.
1080     *
1081     *----------------------------------------------------------------------
1082     */
1083    
1084     int
1085     Tcl_UtfNcmp(cs, ct, n)
1086     CONST char *cs; /* UTF string to compare to ct. */
1087     CONST char *ct; /* UTF string cs is compared to. */
1088     unsigned long n; /* Number of UTF chars to compare. */
1089     {
1090     Tcl_UniChar ch1, ch2;
1091     /*
1092     * Another approach that should work is:
1093     * return memcmp(cs, ct, (unsigned) (Tcl_UtfAtIndex(cs, n) - cs));
1094     * That assumes that ct is a properly formed UTF, so we will just
1095     * be comparing the bytes that compromise those strings to the
1096     * char length n.
1097     */
1098     while (n-- > 0) {
1099     /*
1100     * n must be interpreted as chars, not bytes.
1101     * This should be called only when both strings are of
1102     * at least n chars long (no need for \0 check)
1103     */
1104     cs += Tcl_UtfToUniChar(cs, &ch1);
1105     ct += Tcl_UtfToUniChar(ct, &ch2);
1106     if (ch1 != ch2) {
1107     return (ch1 - ch2);
1108     }
1109     }
1110     return 0;
1111     }
1112    
1113     /*
1114     *----------------------------------------------------------------------
1115     *
1116     * Tcl_UtfNcasecmp --
1117     *
1118     * Compare at most n UTF chars of string cs to string ct case
1119     * insensitive. Both cs and ct are assumed to be at least n
1120     * UTF chars long.
1121     *
1122     * Results:
1123     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1124     *
1125     * Side effects:
1126     * None.
1127     *
1128     *----------------------------------------------------------------------
1129     */
1130    
1131     int
1132     Tcl_UtfNcasecmp(cs, ct, n)
1133     CONST char *cs; /* UTF string to compare to ct. */
1134     CONST char *ct; /* UTF string cs is compared to. */
1135     unsigned long n; /* Number of UTF chars to compare. */
1136     {
1137     Tcl_UniChar ch1, ch2;
1138     while (n-- > 0) {
1139     /*
1140     * n must be interpreted as chars, not bytes.
1141     * This should be called only when both strings are of
1142     * at least n chars long (no need for \0 check)
1143     */
1144     cs += Tcl_UtfToUniChar(cs, &ch1);
1145     ct += Tcl_UtfToUniChar(ct, &ch2);
1146     if (ch1 != ch2) {
1147     ch1 = Tcl_UniCharToLower(ch1);
1148     ch2 = Tcl_UniCharToLower(ch2);
1149     if (ch1 != ch2) {
1150     return (ch1 - ch2);
1151     }
1152     }
1153     }
1154     return 0;
1155     }
1156    
1157     /*
1158     *----------------------------------------------------------------------
1159     *
1160     * Tcl_UniCharToUpper --
1161     *
1162     * Compute the uppercase equivalent of the given Unicode character.
1163     *
1164     * Results:
1165     * Returns the uppercase Unicode character.
1166     *
1167     * Side effects:
1168     * None.
1169     *
1170     *----------------------------------------------------------------------
1171     */
1172    
1173     Tcl_UniChar
1174     Tcl_UniCharToUpper(ch)
1175     int ch; /* Unicode character to convert. */
1176     {
1177     int info = GetUniCharInfo(ch);
1178    
1179     if (GetCaseType(info) & 0x04) {
1180     return (Tcl_UniChar) (ch - GetDelta(info));
1181     } else {
1182     return ch;
1183     }
1184     }
1185    
1186     /*
1187     *----------------------------------------------------------------------
1188     *
1189     * Tcl_UniCharToLower --
1190     *
1191     * Compute the lowercase equivalent of the given Unicode character.
1192     *
1193     * Results:
1194     * Returns the lowercase Unicode character.
1195     *
1196     * Side effects:
1197     * None.
1198     *
1199     *----------------------------------------------------------------------
1200     */
1201    
1202     Tcl_UniChar
1203     Tcl_UniCharToLower(ch)
1204     int ch; /* Unicode character to convert. */
1205     {
1206     int info = GetUniCharInfo(ch);
1207    
1208     if (GetCaseType(info) & 0x02) {
1209     return (Tcl_UniChar) (ch + GetDelta(info));
1210     } else {
1211     return ch;
1212     }
1213     }
1214    
1215     /*
1216     *----------------------------------------------------------------------
1217     *
1218     * Tcl_UniCharToTitle --
1219     *
1220     * Compute the titlecase equivalent of the given Unicode character.
1221     *
1222     * Results:
1223     * Returns the titlecase Unicode character.
1224     *
1225     * Side effects:
1226     * None.
1227     *
1228     *----------------------------------------------------------------------
1229     */
1230    
1231     Tcl_UniChar
1232     Tcl_UniCharToTitle(ch)
1233     int ch; /* Unicode character to convert. */
1234     {
1235     int info = GetUniCharInfo(ch);
1236     int mode = GetCaseType(info);
1237    
1238     if (mode & 0x1) {
1239     /*
1240     * Subtract or add one depending on the original case.
1241     */
1242    
1243     return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1244     } else if (mode == 0x4) {
1245     return (Tcl_UniChar) (ch - GetDelta(info));
1246     } else {
1247     return ch;
1248     }
1249     }
1250    
1251     /*
1252     *----------------------------------------------------------------------
1253     *
1254     * Tcl_UniCharLen --
1255     *
1256     * Find the length of a UniChar string. The str input must be null
1257     * terminated.
1258     *
1259     * Results:
1260     * Returns the length of str in UniChars (not bytes).
1261     *
1262     * Side effects:
1263     * None.
1264     *
1265     *----------------------------------------------------------------------
1266     */
1267    
1268     int
1269     Tcl_UniCharLen(str)
1270     Tcl_UniChar *str; /* Unicode string to find length of. */
1271     {
1272     int len = 0;
1273    
1274     while (*str != '\0') {
1275     len++;
1276     str++;
1277     }
1278     return len;
1279     }
1280    
1281     /*
1282     *----------------------------------------------------------------------
1283     *
1284     * Tcl_UniCharNcmp --
1285     *
1286     * Compare at most n unichars of string cs to string ct. Both cs
1287     * and ct are assumed to be at least n unichars long.
1288     *
1289     * Results:
1290     * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1291     *
1292     * Side effects:
1293     * None.
1294     *
1295     *----------------------------------------------------------------------
1296     */
1297    
1298     int
1299     Tcl_UniCharNcmp(cs, ct, n)
1300     CONST Tcl_UniChar *cs; /* Unicode string to compare to ct. */
1301     CONST Tcl_UniChar *ct; /* Unicode string cs is compared to. */
1302     unsigned long n; /* Number of unichars to compare. */
1303     {
1304     for ( ; n != 0; n--, cs++, ct++) {
1305     if (*cs != *ct) {
1306     return *cs - *ct;
1307     }
1308     if (*cs == '\0') {
1309     break;
1310     }
1311     }
1312     return 0;
1313     }
1314    
1315     /*
1316     *----------------------------------------------------------------------
1317     *
1318     * Tcl_UniCharIsAlnum --
1319     *
1320     * Test if a character is an alphanumeric Unicode character.
1321     *
1322     * Results:
1323     * Returns 1 if character is alphanumeric.
1324     *
1325     * Side effects:
1326     * None.
1327     *
1328     *----------------------------------------------------------------------
1329     */
1330    
1331     int
1332     Tcl_UniCharIsAlnum(ch)
1333     int ch; /* Unicode character to test. */
1334     {
1335     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1336    
1337     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1338     }
1339    
1340     /*
1341     *----------------------------------------------------------------------
1342     *
1343     * Tcl_UniCharIsAlpha --
1344     *
1345     * Test if a character is an alphabetic Unicode character.
1346     *
1347     * Results:
1348     * Returns 1 if character is alphabetic.
1349     *
1350     * Side effects:
1351     * None.
1352     *
1353     *----------------------------------------------------------------------
1354     */
1355    
1356     int
1357     Tcl_UniCharIsAlpha(ch)
1358     int ch; /* Unicode character to test. */
1359     {
1360     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1361     return ((ALPHA_BITS >> category) & 1);
1362     }
1363    
1364     /*
1365     *----------------------------------------------------------------------
1366     *
1367     * Tcl_UniCharIsControl --
1368     *
1369     * Test if a character is a Unicode control character.
1370     *
1371     * Results:
1372     * Returns non-zero if character is a control.
1373     *
1374     * Side effects:
1375     * None.
1376     *
1377     *----------------------------------------------------------------------
1378     */
1379    
1380     int
1381     Tcl_UniCharIsControl(ch)
1382     int ch; /* Unicode character to test. */
1383     {
1384     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1385     }
1386    
1387     /*
1388     *----------------------------------------------------------------------
1389     *
1390     * Tcl_UniCharIsDigit --
1391     *
1392     * Test if a character is a numeric Unicode character.
1393     *
1394     * Results:
1395     * Returns non-zero if character is a digit.
1396     *
1397     * Side effects:
1398     * None.
1399     *
1400     *----------------------------------------------------------------------
1401     */
1402    
1403     int
1404     Tcl_UniCharIsDigit(ch)
1405     int ch; /* Unicode character to test. */
1406     {
1407     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK)
1408     == DECIMAL_DIGIT_NUMBER);
1409     }
1410    
1411     /*
1412     *----------------------------------------------------------------------
1413     *
1414     * Tcl_UniCharIsGraph --
1415     *
1416     * Test if a character is any Unicode print character except space.
1417     *
1418     * Results:
1419     * Returns non-zero if character is printable, but not space.
1420     *
1421     * Side effects:
1422     * None.
1423     *
1424     *----------------------------------------------------------------------
1425     */
1426    
1427     int
1428     Tcl_UniCharIsGraph(ch)
1429     int ch; /* Unicode character to test. */
1430     {
1431     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1432     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1433     }
1434    
1435     /*
1436     *----------------------------------------------------------------------
1437     *
1438     * Tcl_UniCharIsLower --
1439     *
1440     * Test if a character is a lowercase Unicode character.
1441     *
1442     * Results:
1443     * Returns non-zero if character is lowercase.
1444     *
1445     * Side effects:
1446     * None.
1447     *
1448     *----------------------------------------------------------------------
1449     */
1450    
1451     int
1452     Tcl_UniCharIsLower(ch)
1453     int ch; /* Unicode character to test. */
1454     {
1455     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1456     }
1457    
1458     /*
1459     *----------------------------------------------------------------------
1460     *
1461     * Tcl_UniCharIsPrint --
1462     *
1463     * Test if a character is a Unicode print character.
1464     *
1465     * Results:
1466     * Returns non-zero if character is printable.
1467     *
1468     * Side effects:
1469     * None.
1470     *
1471     *----------------------------------------------------------------------
1472     */
1473    
1474     int
1475     Tcl_UniCharIsPrint(ch)
1476     int ch; /* Unicode character to test. */
1477     {
1478     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1479     return ((PRINT_BITS >> category) & 1);
1480     }
1481    
1482     /*
1483     *----------------------------------------------------------------------
1484     *
1485     * Tcl_UniCharIsPunct --
1486     *
1487     * Test if a character is a Unicode punctuation character.
1488     *
1489     * Results:
1490     * Returns non-zero if character is punct.
1491     *
1492     * Side effects:
1493     * None.
1494     *
1495     *----------------------------------------------------------------------
1496     */
1497    
1498     int
1499     Tcl_UniCharIsPunct(ch)
1500     int ch; /* Unicode character to test. */
1501     {
1502     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1503     return ((PUNCT_BITS >> category) & 1);
1504     }
1505    
1506     /*
1507     *----------------------------------------------------------------------
1508     *
1509     * Tcl_UniCharIsSpace --
1510     *
1511     * Test if a character is a whitespace Unicode character.
1512     *
1513     * Results:
1514     * Returns non-zero if character is a space.
1515     *
1516     * Side effects:
1517     * None.
1518     *
1519     *----------------------------------------------------------------------
1520     */
1521    
1522     int
1523     Tcl_UniCharIsSpace(ch)
1524     int ch; /* Unicode character to test. */
1525     {
1526     register int category;
1527    
1528     /*
1529     * If the character is within the first 127 characters, just use the
1530     * standard C function, otherwise consult the Unicode table.
1531     */
1532    
1533     if (ch < 0x80) {
1534     return isspace(UCHAR(ch)); /* INTL: ISO space */
1535     } else {
1536     category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1537     return ((SPACE_BITS >> category) & 1);
1538     }
1539     }
1540    
1541     /*
1542     *----------------------------------------------------------------------
1543     *
1544     * Tcl_UniCharIsUpper --
1545     *
1546     * Test if a character is a uppercase Unicode character.
1547     *
1548     * Results:
1549     * Returns non-zero if character is uppercase.
1550     *
1551     * Side effects:
1552     * None.
1553     *
1554     *----------------------------------------------------------------------
1555     */
1556    
1557     int
1558     Tcl_UniCharIsUpper(ch)
1559     int ch; /* Unicode character to test. */
1560     {
1561     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1562     }
1563    
1564     /*
1565     *----------------------------------------------------------------------
1566     *
1567     * Tcl_UniCharIsWordChar --
1568     *
1569     * Test if a character is alphanumeric or a connector punctuation
1570     * mark.
1571     *
1572     * Results:
1573     * Returns 1 if character is a word character.
1574     *
1575     * Side effects:
1576     * None.
1577     *
1578     *----------------------------------------------------------------------
1579     */
1580    
1581     int
1582     Tcl_UniCharIsWordChar(ch)
1583     int ch; /* Unicode character to test. */
1584     {
1585     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1586    
1587     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1588     }
1589    
1590    
1591     /* $History: tclutf.c $
1592     *
1593     * ***************** Version 1 *****************
1594     * User: Dtashley Date: 1/02/01 Time: 1:05a
1595     * Created in $/IjuScripter, IjuConsole/Source/Tcl Base
1596     * Initial check-in.
1597     */
1598    
1599     /* End of TCL_UTF.C */

dashley@gmail.com
ViewVC Help
Powered by ViewVC 1.1.25