Check-in [62362d0caa]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:merge 8.5
Timelines: family | ancestors | descendants | both | core-8-6-branch
Files: files | file ages | folders
SHA3-256: 62362d0caadda237b39c64cd152badb1de784de69c1e41599f26f36006f0532e
User & Date: dgp 2020-05-06 21:42:47.884
Context
2020-05-07
10:09
Optimize Tcl_UtfToUniCharDString() check-in: 806e1e868c user: jan.nijtmans tags: core-8-6-branch
2020-05-06
21:52
merge 8.6 check-in: 4d08cde908 user: dgp tags: core-8-branch
21:42
merge 8.5 check-in: 62362d0caa user: dgp tags: core-8-6-branch
21:08
Tighten optimization in Tcl_NumUtfChars. Explain in comments. check-in: dabb52db36 user: dgp tags: core-8-5-branch
19:31
merge 8.5 check-in: 01956c0799 user: dgp tags: core-8-6-branch
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclUtf.c.
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604

605
606
607
608
609
610



611


612







613

614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636




637
638
639
640
641
642
643
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_NumUtfChars(
    const char *src,	/* The UTF-8 string to measure. */
    int length)			/* The length of the string in bytes, or -1
				 * for strlen(string). */
{
    Tcl_UniChar ch = 0;
    int i = 0;

    /*
     * The separate implementations are faster.
     *
     * Since this is a time-sensitive function, we also do the check for the
     * single-byte char case specially.
     */

    if (length < 0) {
	/* No byte length given: scan until the terminating NUL byte. */
	while (*src != '\0') {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	/*
	 * Clamp a counter that has gone negative to INT_MAX.
	 * NOTE(review): this only works if i++ wraps on overflow, which
	 * ISO C does not guarantee for a signed int.
	 */
	if (i < 0) i = INT_MAX; /* Bug [2738427] */
    } else {
	/*
	 * Explicit byte length: the scan is bounded by src + length.
	 */
	const char *endPtr = src + length - TCL_UTF_MAX;
	/*
	 * endPtr starts TCL_UTF_MAX bytes short of the real end of the
	 * string, so the loop below only runs while more than TCL_UTF_MAX
	 * bytes remain and makes no Tcl_UtfCharComplete call.
	 *
	 * NOTE(review): when length < TCL_UTF_MAX this pointer lies before
	 * src. The loop condition is then false on entry, but forming a
	 * pointer before the start of a buffer is not strictly portable C.
	 * Confirm against callers / consider comparing byte counts instead.
	 */

	while (src < endPtr) {

#if TCL_UTF_MAX < 4
	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
		/* treat F0 - F4 as single character */
		ch = 0;
		src++;
	    } else
#endif
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	/* Move endPtr back to the real end of the string (src + length). */
	endPtr += TCL_UTF_MAX;
	/* Tail of the string: ask Tcl_UtfCharComplete before each decode. */
	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
#if TCL_UTF_MAX < 4
	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
		/* treat F0 - F4 as single character */
		ch = 0;
		src++;
	    } else
#endif
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (src < endPtr) {
	    /*
	     * The loop above stopped before the end, i.e. Tcl_UtfCharComplete
	     * rejected what is left. Count every leftover byte as one.
	     */
	    i += endPtr - src;
	}
    }
    return i;
}

/*







|
|




<
<
<
<
<
<
<

>
|



<

>
>
>
|
>
>

>
>
>
>
>
>
>
|
>










|












>
>
>
>







584
585
586
587
588
589
590
591
592
593
594
595
596







597
598
599
600
601
602

603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
 *
 *---------------------------------------------------------------------------
 */

int
Tcl_NumUtfChars(
    const char *src,	/* The UTF-8 string to measure. */
    int length)		/* The length of the string in bytes, or -1
			 * for strlen(string). */
{
    Tcl_UniChar ch = 0;
    int i = 0;

    /*
     * Returns the number of characters counted in src. With a negative
     * length the scan runs to the terminating NUL and the count is capped
     * at INT_MAX. With an explicit length the result lies between 0 and
     * length, and each byte of a trailing incomplete sequence counts as one
     * character.
     */

    if (length < 0) {
	/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
	while ((*src != '\0') && (i < INT_MAX)) {
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
    } else {
	/* Will return value between 0 and length. No overflow checks. */

	/* Pointer to the end of string. Never read endPtr[0] */
	const char *endPtr = src + length;

	/*
	 * Optimize away the Tcl_UtfCharComplete call in this loop.
	 * Justified because when (endPtr - src) >= TCL_UTF_MAX, passing
	 * that count to Tcl_UtfCharComplete we know will cause return of 1.
	 *
	 * The test is written on the remaining byte count instead of
	 * against a precomputed (endPtr - TCL_UTF_MAX + 1) breakpoint. For
	 * strings shorter than TCL_UTF_MAX - 1 bytes that breakpoint would
	 * lie before the start of the buffer, and merely forming such a
	 * pointer is undefined behavior in C. The two conditions are
	 * otherwise equivalent.
	 */
	while ((endPtr - src) >= TCL_UTF_MAX) {
#if TCL_UTF_MAX < 4
	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
		/* treat F0 - F4 as single character */
		ch = 0;
		src++;
	    } else
#endif
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	/* Loop over the remaining string where call must happen */
	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
#if TCL_UTF_MAX < 4
	    if (((unsigned)UCHAR(*src) - 0xF0) < 5) {
		/* treat F0 - F4 as single character */
		ch = 0;
		src++;
	    } else
#endif
	    src += TclUtfToUniChar(src, &ch);
	    i++;
	}
	if (src < endPtr) {
	    /*
	     * String ends in an incomplete UTF-8 sequence.
	     * Count every byte in it.
	     */
	    i += endPtr - src;
	}
    }
    return i;
}

/*