Check-in [e831429a4d]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix [a814ee5bbd] - string compare/equal -length for long strings
Timelines: family | ancestors | descendants | both | trunk | main
Files: files | file ages | folders
SHA3-256: e831429a4d9cbc8a54833891c0d00b6ecddb391271ca872aa37757ebc0258d63
User & Date: apnadkarni 2023-04-24 16:28:21.363
Context
2023-05-02
16:43
Merge 9.0 check-in: d66ed253e8 user: jan.nijtmans tags: tip-626
2023-04-24
16:49
Fix [9369f83649] - binary format/scan for big data check-in: 06ceffb2f3 user: apnadkarni tags: trunk, main
16:28
Fix [a814ee5bbd] - string compare/equal -length for long strings check-in: e831429a4d user: apnadkarni tags: trunk, main
15:51
Adapt signature of TclCompileScript() to fit better after TIP 660. check-in: 815e08b6d1 user: dgp tags: trunk, main
Changes
Unified Diff Ignore Whitespace Patch
Changes to generic/tclCmdMZ.c.
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
    /*
     * Remember to keep code here in some sync with the byte-compiled versions
     * in tclExecute.c (INST_STR_EQ, INST_STR_NEQ and INST_STR_CMP as well as
     * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...).
     */

    const char *string2;
    int i, match, nocase = 0, reqlength = -1;
    Tcl_Size length;

    if (objc < 3 || objc > 6) {
    str_cmp_args:
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string2 = Tcl_GetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string2, "-nocase", length)) {
	    nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string2, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], &reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase or -length",
		    string2));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",







|
|


















|







2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
    /*
     * Remember to keep code here in some sync with the byte-compiled versions
     * in tclExecute.c (INST_STR_EQ, INST_STR_NEQ and INST_STR_CMP as well as
     * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...).
     */

    const char *string2;
    int i, match, nocase = 0;
    Tcl_Size length, reqlength = -1;

    if (objc < 3 || objc > 6) {
    str_cmp_args:
	Tcl_WrongNumArgs(interp, 1, objv,
		"?-nocase? ?-length int? string1 string2");
	return TCL_ERROR;
    }

    for (i = 1; i < objc-2; i++) {
	string2 = Tcl_GetStringFromObj(objv[i], &length);
	if ((length > 1) && !strncmp(string2, "-nocase", length)) {
	    nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string2, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetSizeIntFromObj(interp, objv[i], &reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase or -length",
		    string2));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
2713
2714
2715
2716
2717
2718
2719
2720

2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
{
    /*
     * Remember to keep code here in some sync with the byte-compiled versions
     * in tclExecute.c (INST_STR_EQ, INST_STR_NEQ and INST_STR_CMP as well as
     * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...).
     */

    int match, nocase, reqlength, status;


    status = TclStringCmpOpts(interp, objc, objv, &nocase, &reqlength);
    if (status != TCL_OK) {
	return status;
    }

    objv += objc-2;
    match = TclStringCmp(objv[0], objv[1], 0, nocase, reqlength);
    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(match));
    return TCL_OK;
}

int
TclStringCmpOpts(
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[],	/* Argument objects. */
    int *nocase,
    int *reqlength)
{
    int i;
    Tcl_Size length;
    const char *string;

    *reqlength = -1;
    *nocase = 0;







|
>


















|







2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
{
    /*
     * Remember to keep code here in some sync with the byte-compiled versions
     * in tclExecute.c (INST_STR_EQ, INST_STR_NEQ and INST_STR_CMP as well as
     * the expr string comparison in INST_EQ/INST_NEQ/INST_LT/...).
     */

    int match, nocase, status;
    Tcl_Size reqlength;

    status = TclStringCmpOpts(interp, objc, objv, &nocase, &reqlength);
    if (status != TCL_OK) {
	return status;
    }

    objv += objc-2;
    match = TclStringCmp(objv[0], objv[1], 0, nocase, reqlength);
    Tcl_SetObjResult(interp, Tcl_NewWideIntObj(match));
    return TCL_OK;
}

int
TclStringCmpOpts(
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[],	/* Argument objects. */
    int *nocase,
    Tcl_Size *reqlength)
{
    int i;
    Tcl_Size length;
    const char *string;

    *reqlength = -1;
    *nocase = 0;
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
	    *nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetIntFromObj(interp, objv[i], reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase or -length",
		    string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",







|







2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
	    *nocase = 1;
	} else if ((length > 1)
		&& !strncmp(string, "-length", length)) {
	    if (i+1 >= objc-2) {
		goto str_cmp_args;
	    }
	    i++;
	    if (TclGetSizeIntFromObj(interp, objv[i], reqlength) != TCL_OK) {
		return TCL_ERROR;
	    }
	} else {
	    Tcl_SetObjResult(interp, Tcl_ObjPrintf(
		    "bad option \"%s\": must be -nocase or -length",
		    string));
	    Tcl_SetErrorCode(interp, "TCL", "LOOKUP", "INDEX", "option",
Changes to generic/tclInt.h.
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
MODULE_SCOPE void *	TclStackRealloc(Tcl_Interp *interp, void *ptr,
			    Tcl_Size numBytes);
typedef int (*memCmpFn_t)(const void*, const void*, size_t);
MODULE_SCOPE int	TclStringCmp(Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr,
			    int checkEq, int nocase, Tcl_Size reqlength);
MODULE_SCOPE int	TclStringCmpOpts(Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[], int *nocase,
			    int *reqlength);
MODULE_SCOPE int	TclStringMatch(const char *str, Tcl_Size strLen,
			    const char *pattern, int ptnLen, int flags);
MODULE_SCOPE int	TclStringMatchObj(Tcl_Obj *stringObj,
			    Tcl_Obj *patternObj, int flags);
MODULE_SCOPE void	TclSubstCompile(Tcl_Interp *interp, const char *bytes,
			    Tcl_Size numBytes, int flags, Tcl_Size line,
			    struct CompileEnv *envPtr);







|







3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
MODULE_SCOPE void *	TclStackRealloc(Tcl_Interp *interp, void *ptr,
			    Tcl_Size numBytes);
typedef int (*memCmpFn_t)(const void*, const void*, size_t);
MODULE_SCOPE int	TclStringCmp(Tcl_Obj *value1Ptr, Tcl_Obj *value2Ptr,
			    int checkEq, int nocase, Tcl_Size reqlength);
MODULE_SCOPE int	TclStringCmpOpts(Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[], int *nocase,
			    Tcl_Size *reqlength);
MODULE_SCOPE int	TclStringMatch(const char *str, Tcl_Size strLen,
			    const char *pattern, int ptnLen, int flags);
MODULE_SCOPE int	TclStringMatchObj(Tcl_Obj *stringObj,
			    Tcl_Obj *patternObj, int flags);
MODULE_SCOPE void	TclSubstCompile(Tcl_Interp *interp, const char *bytes,
			    Tcl_Size numBytes, int flags, Tcl_Size line,
			    struct CompileEnv *envPtr);
Changes to generic/tclStringObj.c.
3521
3522
3523
3524
3525
3526
3527

3528
3529
3530
3531
3532
3533
3534
    int empty, match;
    Tcl_Size length, s1len = 0, s2len = 0;
    memCmpFn_t memCmpFn;

    if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
	/*
	 * Always match at 0 chars of if it is the same obj.

	 */
	match = 0;
    } else {
	if (!nocase && TclIsPureByteArray(value1Ptr)
		&& TclIsPureByteArray(value2Ptr)) {
	    /*
	     * Use binary versions of comparisons since that won't cause undue







>







3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
    int empty, match;
    Tcl_Size length, s1len = 0, s2len = 0;
    memCmpFn_t memCmpFn;

    if ((reqlength == 0) || (value1Ptr == value2Ptr)) {
	/*
	 * Always match at 0 chars of if it is the same obj.
	 * Note: as documented reqlength negative means it is ignored
	 */
	match = 0;
    } else {
	if (!nocase && TclIsPureByteArray(value1Ptr)
		&& TclIsPureByteArray(value2Ptr)) {
	    /*
	     * Use binary versions of comparisons since that won't cause undue
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
#else
			    checkEq
#endif
			    ) {
			memCmpFn = memcmp;
			s1len *= sizeof(Tcl_UniChar);
			s2len *= sizeof(Tcl_UniChar);
			if (reqlength != TCL_INDEX_NONE) {
			    reqlength *= sizeof(Tcl_UniChar);
			}
		    } else {
			memCmpFn = (memCmpFn_t) TclUniCharNcmp;
		    }
		}
	    }







|







3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
#else
			    checkEq
#endif
			    ) {
			memCmpFn = memcmp;
			s1len *= sizeof(Tcl_UniChar);
			s2len *= sizeof(Tcl_UniChar);
			if (reqlength > 0) {
			    reqlength *= sizeof(Tcl_UniChar);
			}
		    } else {
			memCmpFn = (memCmpFn_t) TclUniCharNcmp;
		    }
		}
	    }
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
		    match = 0;
		    goto matchdone;
		}
	    } else {
		s1 = Tcl_GetStringFromObj(value1Ptr, &s1len);
		s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
	    }
	    if (!nocase && checkEq && reqlength == TCL_INDEX_NONE) {
		/*
		 * When we have equal-length we can check only for
		 * (in)equality. We can use memcmp in all (n)eq cases because
		 * we don't need to worry about lexical LE/BE variance.
		 */

		memCmpFn = memcmp;
	    } else {
		/*
		 * As a catch-all we will work with UTF-8. We cannot use
		 * memcmp() as that is unsafe with any string containing NUL
		 * (\xC0\x80 in Tcl's utf rep). We can use the more efficient
		 * TclpUtfNcmp2 if we are case-sensitive and no specific
		 * length was requested.
		 */

		if ((reqlength == TCL_INDEX_NONE) && !nocase) {
		    memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
		} else {
		    s1len = Tcl_NumUtfChars(s1, s1len);
		    s2len = Tcl_NumUtfChars(s2, s2len);
		    memCmpFn = (memCmpFn_t)
			    (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp);
		}
	    }
	}

	/* At this point s1len, s2len, and reqlength should by now have been
	 * adjusted so that they are all in the units expected by the selected
	 * comparison function.
	 */
	length = (s1len < s2len) ? s1len : s2len;
	if (reqlength == TCL_INDEX_NONE) {
	    /*
	     * The requested length is negative, so ignore it by setting it
	     * to length + 1 to correct the match var.
	     */

	    reqlength = length + 1;
	} else if (reqlength > 0 && reqlength < length) {
	    length = reqlength;
	}

	if (checkEq && reqlength == TCL_INDEX_NONE && (s1len != s2len)) {
	    match = 1;		/* This will be reversed below. */
	} else {
	    /*
	     * The comparison function should compare up to the minimum byte
	     * length only.
	     */








|
















|















|










|







3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
		    match = 0;
		    goto matchdone;
		}
	    } else {
		s1 = Tcl_GetStringFromObj(value1Ptr, &s1len);
		s2 = Tcl_GetStringFromObj(value2Ptr, &s2len);
	    }
	    if (!nocase && checkEq && reqlength < 0) {
		/*
		 * When we have equal-length we can check only for
		 * (in)equality. We can use memcmp in all (n)eq cases because
		 * we don't need to worry about lexical LE/BE variance.
		 */

		memCmpFn = memcmp;
	    } else {
		/*
		 * As a catch-all we will work with UTF-8. We cannot use
		 * memcmp() as that is unsafe with any string containing NUL
		 * (\xC0\x80 in Tcl's utf rep). We can use the more efficient
		 * TclpUtfNcmp2 if we are case-sensitive and no specific
		 * length was requested.
		 */

		if ((reqlength < 0) && !nocase) {
		    memCmpFn = (memCmpFn_t) TclpUtfNcmp2;
		} else {
		    s1len = Tcl_NumUtfChars(s1, s1len);
		    s2len = Tcl_NumUtfChars(s2, s2len);
		    memCmpFn = (memCmpFn_t)
			    (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp);
		}
	    }
	}

	/* At this point s1len, s2len, and reqlength should by now have been
	 * adjusted so that they are all in the units expected by the selected
	 * comparison function.
	 */
	length = (s1len < s2len) ? s1len : s2len;
	if (reqlength < 0) {
	    /*
	     * The requested length is negative, so ignore it by setting it
	     * to length + 1 to correct the match var.
	     */

	    reqlength = length + 1;
	} else if (reqlength > 0 && reqlength < length) {
	    length = reqlength;
	}

	if (checkEq && reqlength < 0 && (s1len != s2len)) {
	    match = 1;		/* This will be reversed below. */
	} else {
	    /*
	     * The comparison function should compare up to the minimum byte
	     * length only.
	     */

Changes to tests/bigdata.test.
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

163
164
165
166
167
168
169
170
}

#
# string compare/equal
bigtestRO string-equal/compare-bigdata-1 "string compare/equal equal strings" {0 1} -body {
    list [string compare $s1 $s2] [string equal $s1 $s2]
} -setup {
    set len [expr {$::bigLengths(intmax)+1}]
    set s1 [bigString $len]
    set s2 [bigString $len]; # Use separate string to avoid Tcl_Obj * being same
} -cleanup {
    bigClean
}
bigtestRO string-equal/compare-bigdata-2 "string compare/equal -length unequal strings" {-1 0 0 1} -body {
    # Also tests lengths do not wrap
    set result {}
    lappend result [string compare $s1 $s2]
    lappend result [string equal $s1 $s2]
    # Check lengths > UINT_MAX
    # Also that lengths do not truncate to sizeof(int)
    lappend result [string compare -length $len $s1 $s2]
    lappend result [string equal -length $len $s1 $s2]
} -setup {
    set len [expr {$::bigLengths(uintmax)+2}]
    set s1 [bigString $len]
    set s2 [bigString $len $len]; # Differs in last char
} -cleanup {
    bigClean

} -constraints bug-a814ee5bbd

#
# string first
bigtestRO string-first-bigdata-1 "string first > INT_MAX" {2147483648 -1 2147483650 1} -body {
    list \
        [string first X $s] \
        [string first Y $s] \







<
|
|










|
|

<
|
|


>
|







135
136
137
138
139
140
141

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156

157
158
159
160
161
162
163
164
165
166
167
168
169
}

#
# string compare/equal
bigtestRO string-equal/compare-bigdata-1 "string compare/equal equal strings" {0 1} -body {
    list [string compare $s1 $s2] [string equal $s1 $s2]
} -setup {

    set s1 [bigString 0x100000000]
    set s2 [bigString 0x100000000]; # Separate so Tcl_Obj is not the same
} -cleanup {
    bigClean
}
bigtestRO string-equal/compare-bigdata-2 "string compare/equal -length unequal strings" {-1 0 0 1} -body {
    # Also tests lengths do not wrap
    set result {}
    lappend result [string compare $s1 $s2]
    lappend result [string equal $s1 $s2]
    # Check lengths > UINT_MAX
    # Also that lengths do not truncate to sizeof(int)
    lappend result [string compare -length 0x100000000 $s1 $s2]
    lappend result [string equal -length 0x100000000 $s1 $s2]
} -setup {

    set s1 [bigString 0x100000001]
    set s2 [bigString 0x100000001 0x100000000]; # Differs in last char
} -cleanup {
    bigClean
}
# -constraints bug-a814ee5bbd

#
# string first
bigtestRO string-first-bigdata-1 "string first > INT_MAX" {2147483648 -1 2147483650 1} -body {
    list \
        [string first X $s] \
        [string first Y $s] \