Index: generic/tclBinary.c ================================================================== --- generic/tclBinary.c +++ generic/tclBinary.c @@ -52,22 +52,16 @@ /* * Prototypes for local procedures defined in this file: */ -static void DupByteArrayInternalRep(Tcl_Obj *srcPtr, - Tcl_Obj *copyPtr); static int FormatNumber(Tcl_Interp *interp, int type, Tcl_Obj *src, unsigned char **cursorPtr); -static void FreeByteArrayInternalRep(Tcl_Obj *objPtr); static int GetFormatSpec(const char **formatPtr, char *cmdPtr, int *countPtr, int *flagsPtr); static Tcl_Obj * ScanNumber(unsigned char *buffer, int type, int flags, Tcl_HashTable **numberCachePtr); -static int SetByteArrayFromAny(Tcl_Interp *interp, - Tcl_Obj *objPtr); -static void UpdateStringOfByteArray(Tcl_Obj *listPtr); static void DeleteScanNumberCache(Tcl_HashTable *numberCachePtr); static int NeedReversing(int format); static void CopyNumber(const void *from, void *to, unsigned length, int type); /* Binary ensemble commands */ @@ -149,432 +143,10 @@ * representation of each character in the String, casting it to a byte by * truncating the upper 8 bits, and then storing the byte in the ByteArray. * Converting from ByteArray to String and back to ByteArray is not lossy, but * converting an arbitrary String to a ByteArray may be. */ - -const Tcl_ObjType tclByteArrayType = { - "bytearray", - FreeByteArrayInternalRep, - DupByteArrayInternalRep, - UpdateStringOfByteArray, - SetByteArrayFromAny -}; - -/* - * The following structure is the internal rep for a ByteArray object. Keeps - * track of how much memory has been used and how much has been allocated for - * the byte array to enable growing and shrinking of the ByteArray object with - * fewer mallocs. - */ - -typedef struct ByteArray { - int used; /* The number of bytes used in the byte - * array. */ - int allocated; /* The amount of space actually allocated - * minus 1 byte. */ - unsigned char bytes[1]; /* The array of bytes. The actual size of this - * field depends on the 'allocated' field - * above. */ -} ByteArray; - -#define BYTEARRAY_SIZE(len) \ - ((unsigned) (TclOffset(ByteArray, bytes) + (len))) -#define GET_BYTEARRAY(objPtr) \ - ((ByteArray *) (objPtr)->internalRep.otherValuePtr) -#define SET_BYTEARRAY(objPtr, baPtr) \ - (objPtr)->internalRep.otherValuePtr = (void *) (baPtr) - -/* - *---------------------------------------------------------------------- - * - * Tcl_NewByteArrayObj -- - * - * This procedure is creates a new ByteArray object and initializes it - * from the given array of bytes. - * - * Results: - * The newly create object is returned. This object will have no initial - * string representation. The returned object has a ref count of 0. - * - * Side effects: - * Memory allocated for new object and copy of byte array argument. - * - *---------------------------------------------------------------------- - */ - -#undef Tcl_NewByteArrayObj - -Tcl_Obj * -Tcl_NewByteArrayObj( - const unsigned char *bytes, /* The array of bytes used to initialize the - * new object. */ - int length) /* Length of the array of bytes, which must be - * >= 0. */ -{ -#ifdef TCL_MEM_DEBUG - return Tcl_DbNewByteArrayObj(bytes, length, "unknown", 0); -#else /* if not TCL_MEM_DEBUG */ - Tcl_Obj *objPtr; - - TclNewObj(objPtr); - Tcl_SetByteArrayObj(objPtr, bytes, length); - return objPtr; -#endif /* TCL_MEM_DEBUG */ -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_DbNewByteArrayObj -- - * - * This procedure is normally called when debugging: i.e., when - * TCL_MEM_DEBUG is defined. It is the same as the Tcl_NewByteArrayObj - * above except that it calls Tcl_DbCkalloc directly with the file name - * and line number from its caller. This simplifies debugging since then - * the [memory active] command will report the correct file name and line - * number when reporting objects that haven't been freed. - * - * When TCL_MEM_DEBUG is not defined, this procedure just returns the - * result of calling Tcl_NewByteArrayObj. - * - * Results: - * The newly create object is returned. This object will have no initial - * string representation. The returned object has a ref count of 0. - * - * Side effects: - * Memory allocated for new object and copy of byte array argument. - * - *---------------------------------------------------------------------- - */ - -Tcl_Obj * -Tcl_DbNewByteArrayObj( - const unsigned char *bytes, /* The array of bytes used to initialize the - * new object. */ - int length, /* Length of the array of bytes, which must be - * >= 0. */ - const char *file, /* The name of the source file calling this - * procedure; used for debugging. */ - int line) /* Line number in the source file; used for - * debugging. */ -{ -#ifdef TCL_MEM_DEBUG - Tcl_Obj *objPtr; - - TclDbNewObj(objPtr, file, line); - Tcl_SetByteArrayObj(objPtr, bytes, length); - return objPtr; -#else /* if not TCL_MEM_DEBUG */ - return Tcl_NewByteArrayObj(bytes, length); -#endif /* TCL_MEM_DEBUG */ -} - -/* - *--------------------------------------------------------------------------- - * - * Tcl_SetByteArrayObj -- - * - * Modify an object to be a ByteArray object and to have the specified - * array of bytes as its value. - * - * Results: - * None. - * - * Side effects: - * The object's old string rep and internal rep is freed. Memory - * allocated for copy of byte array argument. - * - *---------------------------------------------------------------------- - */ - -void -Tcl_SetByteArrayObj( - Tcl_Obj *objPtr, /* Object to initialize as a ByteArray. */ - const unsigned char *bytes, /* The array of bytes to use as the new - value. May be NULL even if length > 0. */ - int length) /* Length of the array of bytes, which must - be >= 0. */ -{ - ByteArray *byteArrayPtr; - - if (Tcl_IsShared(objPtr)) { - Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayObj"); - } - TclFreeIntRep(objPtr); - Tcl_InvalidateStringRep(objPtr); - - length = (length < 0) ? 0 : length; - byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length)); - memset(byteArrayPtr, 0, BYTEARRAY_SIZE(length)); - byteArrayPtr->used = length; - byteArrayPtr->allocated = length; - if (bytes && length) { - memcpy(byteArrayPtr->bytes, bytes, (size_t) length); - } - - objPtr->typePtr = &tclByteArrayType; - SET_BYTEARRAY(objPtr, byteArrayPtr); -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_GetByteArrayFromObj -- - * - * Attempt to get the array of bytes from the Tcl object. If the object - * is not already a ByteArray object, an attempt will be made to convert - * it to one. - * - * Results: - * Pointer to array of bytes representing the ByteArray object. - * - * Side effects: - * Frees old internal rep. Allocates memory for new internal rep. - * - *---------------------------------------------------------------------- - */ - -unsigned char * -Tcl_GetByteArrayFromObj( - Tcl_Obj *objPtr, /* The ByteArray object. */ - int *lengthPtr) /* If non-NULL, filled with length of the - * array of bytes in the ByteArray object. */ -{ - ByteArray *baPtr; - - if (objPtr->typePtr != &tclByteArrayType) { - SetByteArrayFromAny(NULL, objPtr); - } - baPtr = GET_BYTEARRAY(objPtr); - - if (lengthPtr != NULL) { - *lengthPtr = baPtr->used; - } - return (unsigned char *) baPtr->bytes; -} - -/* - *---------------------------------------------------------------------- - * - * Tcl_SetByteArrayLength -- - * - * This procedure changes the length of the byte array for this object. - * Once the caller has set the length of the array, it is acceptable to - * directly modify the bytes in the array up until Tcl_GetStringFromObj() - * has been called on this object. - * - * Results: - * The new byte array of the specified length. - * - * Side effects: - * Allocates enough memory for an array of bytes of the requested size. - * When growing the array, the old array is copied to the new array; new - * bytes are undefined. When shrinking, the old array is truncated to the - * specified length. - * - *---------------------------------------------------------------------- - */ - -unsigned char * -Tcl_SetByteArrayLength( - Tcl_Obj *objPtr, /* The ByteArray object. */ - int length) /* New length for internal byte array. */ -{ - ByteArray *byteArrayPtr; - - if (Tcl_IsShared(objPtr)) { - Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayLength"); - } - if (objPtr->typePtr != &tclByteArrayType) { - SetByteArrayFromAny(NULL, objPtr); - } - - byteArrayPtr = GET_BYTEARRAY(objPtr); - if (length > byteArrayPtr->allocated) { - byteArrayPtr = ckrealloc(byteArrayPtr, BYTEARRAY_SIZE(length)); - byteArrayPtr->allocated = length; - SET_BYTEARRAY(objPtr, byteArrayPtr); - } - Tcl_InvalidateStringRep(objPtr); - byteArrayPtr->used = length; - return byteArrayPtr->bytes; -} - -/* - *---------------------------------------------------------------------- - * - * SetByteArrayFromAny -- - * - * Generate the ByteArray internal rep from the string rep. - * - * Results: - * The return value is always TCL_OK. - * - * Side effects: - * A ByteArray object is stored as the internal rep of objPtr. - * - *---------------------------------------------------------------------- - */ - -static int -SetByteArrayFromAny( - Tcl_Interp *interp, /* Not used. */ - Tcl_Obj *objPtr) /* The object to convert to type ByteArray. */ -{ - int length; - const char *src, *srcEnd; - unsigned char *dst; - ByteArray *byteArrayPtr; - Tcl_UniChar ch; - - if (objPtr->typePtr != &tclByteArrayType) { - src = TclGetStringFromObj(objPtr, &length); - srcEnd = src + length; - - byteArrayPtr = ckalloc(BYTEARRAY_SIZE(length)); - for (dst = byteArrayPtr->bytes; src < srcEnd; ) { - src += Tcl_UtfToUniChar(src, &ch); - *dst++ = UCHAR(ch); - } - - byteArrayPtr->used = dst - byteArrayPtr->bytes; - byteArrayPtr->allocated = length; - - TclFreeIntRep(objPtr); - objPtr->typePtr = &tclByteArrayType; - SET_BYTEARRAY(objPtr, byteArrayPtr); - } - return TCL_OK; -} - -/* - *---------------------------------------------------------------------- - * - * FreeByteArrayInternalRep -- - * - * Deallocate the storage associated with a ByteArray data object's - * internal representation. - * - * Results: - * None. - * - * Side effects: - * Frees memory. - * - *---------------------------------------------------------------------- - */ - -static void -FreeByteArrayInternalRep( - Tcl_Obj *objPtr) /* Object with internal rep to free. */ -{ - ckfree(GET_BYTEARRAY(objPtr)); - objPtr->typePtr = NULL; -} - -/* - *---------------------------------------------------------------------- - * - * DupByteArrayInternalRep -- - * - * Initialize the internal representation of a ByteArray Tcl_Obj to a - * copy of the internal representation of an existing ByteArray object. - * - * Results: - * None. - * - * Side effects: - * Allocates memory. - * - *---------------------------------------------------------------------- - */ - -static void -DupByteArrayInternalRep( - Tcl_Obj *srcPtr, /* Object with internal rep to copy. */ - Tcl_Obj *copyPtr) /* Object with internal rep to set. */ -{ - int length; - ByteArray *srcArrayPtr, *copyArrayPtr; - - srcArrayPtr = GET_BYTEARRAY(srcPtr); - length = srcArrayPtr->used; - - copyArrayPtr = ckalloc(BYTEARRAY_SIZE(length)); - copyArrayPtr->used = length; - copyArrayPtr->allocated = length; - memcpy(copyArrayPtr->bytes, srcArrayPtr->bytes, (size_t) length); - SET_BYTEARRAY(copyPtr, copyArrayPtr); - - copyPtr->typePtr = &tclByteArrayType; -} - -/* - *---------------------------------------------------------------------- - * - * UpdateStringOfByteArray -- - * - * Update the string representation for a ByteArray data object. Note: - * This procedure does not invalidate an existing old string rep so - * storage will be lost if this has not already been done. - * - * Results: - * None. - * - * Side effects: - * The object's string is set to a valid string that results from the - * ByteArray-to-string conversion. - * - * The object becomes a string object -- the internal rep is discarded - * and the typePtr becomes NULL. - * - *---------------------------------------------------------------------- - */ - -static void -UpdateStringOfByteArray( - Tcl_Obj *objPtr) /* ByteArray object whose string rep to - * update. */ -{ - int i, length, size; - unsigned char *src; - char *dst; - ByteArray *byteArrayPtr; - - byteArrayPtr = GET_BYTEARRAY(objPtr); - src = byteArrayPtr->bytes; - length = byteArrayPtr->used; - - /* - * How much space will string rep need? - */ - - size = length; - for (i = 0; i < length && size >= 0; i++) { - if ((src[i] == 0) || (src[i] > 127)) { - size++; - } - } - if (size < 0) { - Tcl_Panic("max size for a Tcl value (%d bytes) exceeded", INT_MAX); - } - - dst = ckalloc(size + 1); - objPtr->bytes = dst; - objPtr->length = size; - - if (size == length) { - memcpy(dst, src, (size_t) size); - dst[size] = '\0'; - } else { - for (i = 0; i < length; i++) { - dst += Tcl_UniCharToUtf(src[i], dst); - } - *dst = '\0'; - } -} /* *---------------------------------------------------------------------- * * TclAppendBytesToByteArray -- @@ -1317,23 +889,11 @@ } size--; } } - /* - * Have to do this #ifdef-fery because (as part of defining - * Tcl_NewByteArrayObj) we removed the #def that hides this stuff - * normally. If this code ever gets copied to another file, it - * should be changed back to the simpler version. - */ - -#ifdef TCL_MEM_DEBUG - valuePtr = Tcl_DbNewByteArrayObj(src, size, __FILE__, __LINE__); -#else valuePtr = Tcl_NewByteArrayObj(src, size); -#endif /* TCL_MEM_DEBUG */ - resultPtr = Tcl_ObjSetVar2(interp, objv[arg], NULL, valuePtr, TCL_LEAVE_ERR_MSG); arg++; if (resultPtr == NULL) { DeleteScanNumberCache(numberCachePtr); Index: generic/tclExecute.c ================================================================== --- generic/tclExecute.c +++ generic/tclExecute.c @@ -4520,48 +4520,36 @@ O2S(objResultPtr))); NEXT_INST_F(1, 2, 1); case INST_STR_LEN: valuePtr = OBJ_AT_TOS; + length = Tcl_GetCharLength(valuePtr); + TclNewIntObj(objResultPtr, length); TRACE(("%.20s => %d\n", O2S(valuePtr), length)); NEXT_INST_F(1, 1, 1); case INST_STR_INDEX: value2Ptr = OBJ_AT_TOS; valuePtr = OBJ_UNDER_TOS; /* - * Get char length to calulate what 'end' means. + * Get Unicode char length to calulate what 'end' means. */ length = Tcl_GetCharLength(valuePtr); - if (TclGetIntForIndexM(interp, value2Ptr, length-1, &index)!=TCL_OK) { - goto gotError; + + result = TclGetIntForIndexM(interp, value2Ptr, length - 1, &index); + if (result != TCL_OK) { + goto checkForCatch; } - if ((index < 0) || (index >= length)) { - TclNewObj(objResultPtr); - } else if (TclIsPureByteArray(valuePtr)) { - objResultPtr = Tcl_NewByteArrayObj( - Tcl_GetByteArrayFromObj(valuePtr, &length)+index, 1); - } else if (valuePtr->bytes && length == valuePtr->length) { - objResultPtr = Tcl_NewStringObj((const char *) - valuePtr->bytes+index, 1); + if ((index >= 0) && (index < length)) { + objResultPtr = Tcl_GetRange(valuePtr, index, index); } else { - char buf[TCL_UTF_MAX]; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); - - /* - * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) - * but creating the object as a string seems to be faster in - * practical use. - */ - - length = Tcl_UniCharToUtf(ch, buf); - objResultPtr = Tcl_NewStringObj(buf, length); + TclNewObj(objResultPtr); } TRACE(("%.20s %.20s => %s\n", O2S(valuePtr), O2S(value2Ptr), O2S(objResultPtr))); NEXT_INST_F(1, 2, 1); Index: generic/tclIO.c ================================================================== --- generic/tclIO.c +++ generic/tclIO.c @@ -5713,18 +5713,10 @@ if (appendFlag == 0) { if (encoding == NULL) { Tcl_SetByteArrayLength(objPtr, 0); } else { Tcl_SetObjLength(objPtr, 0); - - /* - * We're going to access objPtr->bytes directly, so we must ensure - * that this is actually a string object (otherwise it might have - * been pure Unicode). - */ - - TclGetString(objPtr); } offset = 0; } else { if (encoding == NULL) { Tcl_GetByteArrayFromObj(objPtr, &offset); Index: generic/tclInt.h ================================================================== --- generic/tclInt.h +++ generic/tclInt.h @@ -3094,13 +3094,16 @@ MODULE_SCOPE void TclSetCmdNameObj(Tcl_Interp *interp, Tcl_Obj *objPtr, Command *cmdPtr); MODULE_SCOPE void TclSetDuplicateObj(Tcl_Obj *dupPtr, Tcl_Obj *objPtr); MODULE_SCOPE void TclSetProcessGlobalValue(ProcessGlobalValue *pgvPtr, Tcl_Obj *newValue, Tcl_Encoding encoding); +MODULE_SCOPE Tcl_UniChar * TclSetUnicodeLength(Tcl_Obj *objPtr, int length); MODULE_SCOPE void TclSignalExitThread(Tcl_ThreadId id, int result); MODULE_SCOPE void * TclStackRealloc(Tcl_Interp *interp, void *ptr, int numBytes); +MODULE_SCOPE int TclStringCompare(Tcl_Obj *obj1Ptr, Tcl_Obj *obj2Ptr, + int reqlength, int nocase, int equal); MODULE_SCOPE int TclStringMatch(const char *str, int strLen, const char *pattern, int ptnLen, int flags); MODULE_SCOPE int TclStringMatchObj(Tcl_Obj *stringObj, Tcl_Obj *patternObj, int flags); MODULE_SCOPE Tcl_Obj * TclStringObjReverse(Tcl_Obj *objPtr); Index: generic/tclStringObj.c ================================================================== --- generic/tclStringObj.c +++ generic/tclStringObj.c @@ -60,19 +60,12 @@ const char *bytes, int numBytes); static void AppendUtfToUtfRep(Tcl_Obj *objPtr, const char *bytes, int numBytes); static void DupStringInternalRep(Tcl_Obj *objPtr, Tcl_Obj *copyPtr); -static int ExtendStringRepWithUnicode(Tcl_Obj *objPtr, - const Tcl_UniChar *unicode, int numChars); -static void ExtendUnicodeRepWithString(Tcl_Obj *objPtr, - const char *bytes, int numBytes, - int numAppendChars); -static void FillUnicodeRep(Tcl_Obj *objPtr); -static void FreeStringInternalRep(Tcl_Obj *objPtr); -static void GrowStringBuffer(Tcl_Obj *objPtr, int needed, int flag); -static void GrowUnicodeBuffer(Tcl_Obj *objPtr, int needed); +static int SetByteArrayFromAny(Tcl_Interp *interp, + Tcl_Obj *objPtr); static int SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); static int UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); @@ -88,10 +81,44 @@ DupStringInternalRep, /* dupIntRepProc */ UpdateStringOfString, /* updateStringProc */ SetStringFromAny /* setFromAnyProc */ }; +/* + * Keep the old bytearray type for backward compatibility with code + * that uses it to distinguish binary data. + */ + +const Tcl_ObjType tclByteArrayType = { + "bytearray", + FreeStringInternalRep, /* freeIntRepPro */ + DupStringInternalRep, /* dupIntRepProc */ + UpdateStringOfString, /* updateStringProc */ + SetByteArrayFromAny /* setFromAnyProc */ +}; + +/* + * To know what shortcuts can be taken with an object, it is important + * to know if all characters are within certain ranges. + * This enum defines the ranges used. + */ + +typedef enum { + Range_Ascii, /* Characters within range 1-127 */ + Range_Byte, /* Characters within range 0-255 */ + Range_Unicode /* Characters within range 0-65535 */ +} CharacterRange; + +/* + * An object can hold up to one fixed width storage of the string. + * This enum defines which is currently present. + */ + +typedef enum { + Storage_None, Storage_Byte, Storage_Unicode +} StorageType; + /* * The following structure is the internal rep for a String object. It keeps * track of how much memory has been used and how much has been allocated for * the Unicode and UTF string to enable growing and shrinking of the UTF and * Unicode reps of the String object with fewer mallocs. To optimize string @@ -104,41 +131,59 @@ * tcl.h, but do not do that unless you are sure what you're doing! */ typedef struct String { int numChars; /* The number of chars in the string. -1 means - * this value has not been calculated. >= 0 - * means that there is a valid Unicode rep, or - * that the number of UTF bytes == the number - * of chars. */ - int allocated; /* The amount of space actually allocated for + * this value has not been calculated, and that + * the range field is invalid. */ + size_t allocated; /* The amount of space actually allocated for * the UTF string (minus 1 byte for the * termination char). */ - int maxChars; /* Max number of chars that can fit in the - * space allocated for the unicode array. */ - int hasUnicode; /* Boolean determining whether the string has - * a Unicode representation. */ - Tcl_UniChar unicode[1]; /* The array of Unicode chars. The actual size - * of this field depends on the 'maxChars' - * field above. */ + size_t uallocated; /* The amount of space actually allocated for + * the fixed width string (minus 2 bytes for + * the termination char if it is Unicode). */ + CharacterRange range; /* The range that all chars in the string are + * within. */ + StorageType storageType; /* Determines if there is a fixed width + * representation stored. */ + union { /* The array of fixed-width chars. The */ + unsigned char bytes[4]; /* actual size of this field depends on the */ + Tcl_UniChar unicode[2]; /* 'uallocated' field above. */ + } data; } String; -#define STRING_MAXCHARS \ - (int)(((size_t)UINT_MAX - sizeof(String))/sizeof(Tcl_UniChar)) -#define STRING_SIZE(numChars) \ - (sizeof(String) + ((numChars) * sizeof(Tcl_UniChar))) -#define stringCheckLimits(numChars) \ - if ((numChars) < 0 || (numChars) > STRING_MAXCHARS) { \ - Tcl_Panic("max length for a Tcl unicode value (%d chars) exceeded", \ - STRING_MAXCHARS); \ - } -#define stringAlloc(numChars) \ - (String *) ckalloc((unsigned) STRING_SIZE(numChars) ) -#define stringRealloc(ptr, numChars) \ - (String *) ckrealloc((ptr), (unsigned) STRING_SIZE(numChars) ) -#define stringAttemptRealloc(ptr, numChars) \ - (String *) attemptckrealloc((ptr), (unsigned) STRING_SIZE(numChars) ) +/* + * Assertions about String struct: + * + * numChars can only be -1 if objPtr->bytes is filled in + * range can only be Range_Ascii if objPtr->bytes is filled in + */ + +#define STRING_UALLOC(numChars) \ + ((numChars) * sizeof(Tcl_UniChar)) +#define STRING_SIZE(numBytes) \ + (sizeof(String) - sizeof(Tcl_UniChar) + (numBytes)) +#define BYTEARRAY_UALLOC(numChars) ((numChars) > 4 ? (numChars) : 4) +#define BYTEARRAY_SIZE(ualloc) ((unsigned) (sizeof(String) - 4 + (ualloc))) +#define STRING_NOMEM(numBytes) \ + (Tcl_Panic("unable to alloc %u bytes", STRING_SIZE(numBytes)), \ + (char *) NULL) +#define stringAlloc(numBytes) \ + (String *) (((numBytes) > INT_MAX - STRING_SIZE(0)) \ + ? STRING_NOMEM(numBytes) \ + : ckalloc((unsigned) STRING_SIZE( \ + (numBytes) ? (numBytes) : sizeof(Tcl_UniChar)) )) +#define stringRealloc(ptr, numBytes) \ + (String *) (((numBytes) > INT_MAX - STRING_SIZE(0)) \ + ? STRING_NOMEM(numBytes) \ + : ckrealloc((char *) ptr, (unsigned) STRING_SIZE( \ + (numBytes) ? (numBytes) : sizeof(Tcl_UniChar)) )) +#define stringAttemptRealloc(ptr, numBytes) \ + (String *) (((numBytes) > INT_MAX - STRING_SIZE(0)) \ + ? NULL \ + : attemptckrealloc((char *) ptr, (unsigned) STRING_SIZE( \ + (numBytes) ? (numBytes) : sizeof(Tcl_UniChar)) )) #define GET_STRING(objPtr) \ ((String *) (objPtr)->internalRep.otherValuePtr) #define SET_STRING(objPtr, stringPtr) \ ((objPtr)->internalRep.otherValuePtr = (void *) (stringPtr)) @@ -445,10 +490,311 @@ } /* *---------------------------------------------------------------------- * + * Tcl_NewByteArrayObj -- + * + * This procedure is creates a new ByteArray object and initializes it + * from the given array of bytes. + * + * Results: + * The newly create object is returned. This object will have no initial + * string representation. The returned object has a ref count of 0. + * + * Side effects: + * Memory allocated for new object and copy of byte array argument. + * + *---------------------------------------------------------------------- + */ + +#ifdef TCL_MEM_DEBUG +#undef Tcl_NewByteArrayObj + +Tcl_Obj * +Tcl_NewByteArrayObj( + const unsigned char *bytes, /* The array of bytes used to initialize the + * new object. */ + int length) /* Length of the array of bytes, which must be + * >= 0. */ +{ + return Tcl_DbNewByteArrayObj(bytes, length, "unknown", 0); +} + +#else /* if not TCL_MEM_DEBUG */ + +Tcl_Obj * +Tcl_NewByteArrayObj( + const unsigned char *bytes, /* The array of bytes used to initialize the + * new object. */ + int length) /* Length of the array of bytes, which must be + * >= 0. */ +{ + Tcl_Obj *objPtr; + + TclNewObj(objPtr); + Tcl_SetByteArrayObj(objPtr, bytes, length); + return objPtr; +} +#endif /* TCL_MEM_DEBUG */ + +/* + *---------------------------------------------------------------------- + * + * Tcl_DbNewByteArrayObj -- + * + * This procedure is normally called when debugging: i.e., when + * TCL_MEM_DEBUG is defined. It is the same as the Tcl_NewByteArrayObj + * above except that it calls Tcl_DbCkalloc directly with the file name + * and line number from its caller. This simplifies debugging since then + * the [memory active] command will report the correct file name and line + * number when reporting objects that haven't been freed. + * + * When TCL_MEM_DEBUG is not defined, this procedure just returns the + * result of calling Tcl_NewByteArrayObj. + * + * Results: + * The newly create object is returned. This object will have no initial + * string representation. The returned object has a ref count of 0. + * + * Side effects: + * Memory allocated for new object and copy of byte array argument. + * + *---------------------------------------------------------------------- + */ + +#ifdef TCL_MEM_DEBUG + +Tcl_Obj * +Tcl_DbNewByteArrayObj( + const unsigned char *bytes, /* The array of bytes used to initialize the + * new object. */ + int length, /* Length of the array of bytes, which must be + * >= 0. */ + const char *file, /* The name of the source file calling this + * procedure; used for debugging. */ + int line) /* Line number in the source file; used for + * debugging. */ +{ + Tcl_Obj *objPtr; + + TclDbNewObj(objPtr, file, line); + Tcl_SetByteArrayObj(objPtr, bytes, length); + return objPtr; +} + +#else /* if not TCL_MEM_DEBUG */ + +Tcl_Obj * +Tcl_DbNewByteArrayObj( + const unsigned char *bytes, /* The array of bytes used to initialize the + * new object. */ + int length, /* Length of the array of bytes, which must be + * >= 0. */ + const char *file, /* The name of the source file calling this + * procedure; used for debugging. */ + int line) /* Line number in the source file; used for + * debugging. */ +{ + return Tcl_NewByteArrayObj(bytes, length); +} +#endif /* TCL_MEM_DEBUG */ + +/* + *--------------------------------------------------------------------------- + * + * Tcl_SetByteArrayObj -- + * + * Modify an object to be a ByteArray object and to have the specified + * array of bytes as its value. + * + * Results: + * None. + * + * Side effects: + * The object's old string rep and internal rep is freed. Memory + * allocated for copy of byte array argument. + * + *---------------------------------------------------------------------- + */ + +void +Tcl_SetByteArrayObj( + Tcl_Obj *objPtr, /* Object to initialize as a ByteArray. */ + const unsigned char *bytes, /* The array of bytes to use as the new + * value. */ + int length) /* Length of the array of bytes, which must be + * >= 0. */ +{ + size_t uallocated; + String *stringPtr; + + if (Tcl_IsShared(objPtr)) { + Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayObj"); + } + + TclFreeIntRep(objPtr); + Tcl_InvalidateStringRep(objPtr); + + uallocated = BYTEARRAY_UALLOC(length); + stringPtr = (String *) ckalloc(BYTEARRAY_SIZE(uallocated)); + stringPtr->numChars = length; + stringPtr->uallocated = uallocated; + stringPtr->storageType = Storage_Byte; + stringPtr->range = Range_Byte; + stringPtr->allocated = 0; + memcpy(stringPtr->data.bytes, bytes, (size_t) length); + + objPtr->typePtr = &tclByteArrayType; + SET_STRING(objPtr, stringPtr); +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_GetByteArrayFromObj -- + * + * Attempt to get the array of bytes from the Tcl object. If the object + * is not already a ByteArray object, an attempt will be made to convert + * it to one. + * + * Results: + * Pointer to array of bytes representing the ByteArray object. + * + * Side effects: + * Frees old internal rep. Allocates memory for new internal rep. + * + *---------------------------------------------------------------------- + */ + +unsigned char * +Tcl_GetByteArrayFromObj( + Tcl_Obj *objPtr, /* The ByteArray object. */ + int *lengthPtr) /* If non-NULL, filled with length of the + * array of bytes in the ByteArray object. */ +{ + String *stringPtr; + + SetByteArrayFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (lengthPtr != NULL) { + *lengthPtr = stringPtr->numChars; + } + return stringPtr->data.bytes; +} + +/* + *---------------------------------------------------------------------- + * + * Tcl_SetByteArrayLength -- + * + * This procedure changes the length of the byte array for this object. + * Once the caller has set the length of the array, it is acceptable to + * directly modify the bytes in the array up until Tcl_GetStringFromObj() + * has been called on this object. + * + * Results: + * The new byte array of the specified length. + * + * Side effects: + * Allocates enough memory for an array of bytes of the requested size. + * When growing the array, the old array is copied to the new array; new + * bytes are undefined. When shrinking, the old array is truncated to the + * specified length. + * + *---------------------------------------------------------------------- + */ + +unsigned char * +Tcl_SetByteArrayLength( + Tcl_Obj *objPtr, /* The ByteArray object. */ + int length) /* New length for internal byte array. */ +{ + size_t uallocated; + String *stringPtr; + + if (Tcl_IsShared(objPtr)) { + Tcl_Panic("%s called with shared object", "Tcl_SetByteArrayLength"); + } + if (objPtr->typePtr != &tclByteArrayType) { + SetByteArrayFromAny(NULL, objPtr); + } + + stringPtr = GET_STRING(objPtr); + uallocated = BYTEARRAY_UALLOC(length); + if (uallocated > stringPtr->uallocated) { + stringPtr = (String *) ckrealloc( + (char *) stringPtr, BYTEARRAY_SIZE(uallocated)); + stringPtr->uallocated = uallocated; + SET_STRING(objPtr, stringPtr); + } + Tcl_InvalidateStringRep(objPtr); + stringPtr->numChars = length; + stringPtr->range = Range_Byte; + return stringPtr->data.bytes; +} + +/* + *---------------------------------------------------------------------- + * + * TclSetUnicodeLength -- + * + * This procedure changes the length of the unicode array for this object. + * Once the caller has set the length of the array, it is acceptable to + * directly modify the elements in the array up until Tcl_GetStringFromObj() + * has been called on this object. + * + * Results: + * The new unicode array of the specified length. + * + * Side effects: + * Allocates enough memory for an array of unichars of the requested size. + * When growing the array, the old array is copied to the new array; new + * bytes are undefined. When shrinking, the old array is truncated to the + * specified length. + * + *---------------------------------------------------------------------- + */ + +Tcl_UniChar * +TclSetUnicodeLength( + Tcl_Obj *objPtr, /* The Unicode object. */ + int length) /* New length for internal unicode array. */ +{ + size_t uallocated; + String *stringPtr; + + if (Tcl_IsShared(objPtr)) { + Tcl_Panic("%s called with shared object", "Tcl_SetUnicodeLength"); + } + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (stringPtr->storageType != Storage_Unicode) { + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + + uallocated = STRING_UALLOC(length); + if (uallocated > stringPtr->uallocated) { + stringPtr = (String *) ckrealloc( + (char *) stringPtr, STRING_SIZE(uallocated)); + stringPtr->uallocated = uallocated; + SET_STRING(objPtr, stringPtr); + } + Tcl_InvalidateStringRep(objPtr); + stringPtr->numChars = length; + stringPtr->range = Range_Unicode; + stringPtr->data.unicode[length] = 0; + return stringPtr->data.unicode; +} + +/* + *---------------------------------------------------------------------- + * * Tcl_GetCharLength -- * * Get the length of the Unicode string from the Tcl object. * * Results: @@ -489,26 +835,45 @@ SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); numChars = stringPtr->numChars; /* - * If numChars is unknown, compute it. - */ - - if (numChars == -1) { - TclNumUtfChars(numChars, objPtr->bytes, objPtr->length); - stringPtr->numChars = numChars; - -#if COMPAT - if (numChars < objPtr->length) { - /* - * Since we've just computed the number of chars, and not all UTF - * chars are 1-byte long, go ahead and populate the unicode - * string. - */ - - FillUnicodeRep(objPtr); + * If numChars is unknown, then calculate the number of characaters while + * populating the Unicode string. + */ + + if (stringPtr->numChars == -1) { + register int i = objPtr->length; + register unsigned char *str = (unsigned char *) objPtr->bytes; + + /* + * This is a speed sensitive function, so run specially over the + * string to count continuous ascii characters before resorting to the + * Tcl_NumUtfChars call. This is a long form of: + stringPtr->numChars = Tcl_NumUtfChars(objPtr->bytes,objPtr->length); + * + * TODO: Consider macro-izing this. + */ + + while (i && (*str < 0xC0)) { + i--; + str++; + } + stringPtr->numChars = objPtr->length - i; + if (i) { + stringPtr->numChars += Tcl_NumUtfChars(objPtr->bytes + + (objPtr->length - i), i); + } + + if (stringPtr->numChars == objPtr->length) { + /* + * All UTF chars are 1-byte long. + */ + + stringPtr->range = Range_Ascii; + } else { + stringPtr->range = Range_Unicode; } #endif } return numChars; } @@ -555,23 +920,30 @@ */ SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode == 0) { + if (stringPtr->numChars == -1) { + Tcl_GetCharLength(objPtr); + } + + if (stringPtr->range == Range_Ascii) { /* - * If numChars is unknown, compute it. + * All of the characters in the Utf string are 1 byte chars. We get + * the Utf string and convert the index'th byte to a Unicode character. */ - if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); - } - if (stringPtr->numChars == objPtr->length) { - return (Tcl_UniChar) objPtr->bytes[index]; - } - FillUnicodeRep(objPtr); - stringPtr = GET_STRING(objPtr); + unichar = (Tcl_UniChar) objPtr->bytes[index]; + } else if ((stringPtr->storageType == Storage_Byte) && + (stringPtr->range == Range_Byte)) { + unichar = (Tcl_UniChar) stringPtr->data.bytes[index]; + } else { + if (stringPtr->storageType != Storage_Unicode) { + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + unichar = stringPtr->data.unicode[index]; } return stringPtr->unicode[index]; } /* @@ -596,11 +968,32 @@ Tcl_UniChar * Tcl_GetUnicode( Tcl_Obj *objPtr) /* The object to find the unicode string * for. */ { - return Tcl_GetUnicodeFromObj(objPtr, NULL); + String *stringPtr; + + SetStringFromAny(NULL, objPtr); + stringPtr = GET_STRING(objPtr); + + if (stringPtr->storageType != Storage_Unicode) { + /* + * Since this function must return a unicode string, and one has + * not yet been stored, force the Unicode to be calculated and stored + * now. + */ + + FillUnicodeRep(objPtr); + + /* + * We need to fetch the pointer again because we have just reallocated + * the structure to make room for the Unicode data. + */ + + stringPtr = GET_STRING(objPtr); + } + return stringPtr->data.unicode; } /* *---------------------------------------------------------------------- * @@ -631,19 +1024,25 @@ String *stringPtr; SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode == 0) { + if (stringPtr->storageType != Storage_Unicode) { + /* + * Since this function must return a unicode string, and one has + * not yet been stored, force the Unicode to be calculated and stored + * now. + */ + FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } if (lengthPtr != NULL) { *lengthPtr = stringPtr->numChars; } - return stringPtr->unicode; + return stringPtr->data.unicode; } /* *---------------------------------------------------------------------- * @@ -689,35 +1088,46 @@ */ SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode == 0) { - /* - * If numChars is unknown, compute it. - */ - - if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); - } - if (stringPtr->numChars == objPtr->length) { - newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1); - - /* - * Since we know the char length of the result, store it. - */ - - SetStringFromAny(NULL, newObjPtr); - stringPtr = GET_STRING(newObjPtr); - stringPtr->numChars = newObjPtr->length; - return newObjPtr; - } - FillUnicodeRep(objPtr); - stringPtr = GET_STRING(objPtr); - } - - return Tcl_NewUnicodeObj(stringPtr->unicode + first, last-first+1); + if (stringPtr->numChars == -1) { + Tcl_GetCharLength(objPtr); + } + + if (stringPtr->range == Range_Ascii) { + char *str = TclGetString(objPtr); + + /* + * All of the characters in the Utf string are 1 byte chars. Create a + * new string object containing the specified range of chars. + */ + + newObjPtr = Tcl_NewStringObj(str+first, last-first+1); + + /* + * Since we know the new string only has 1-byte chars, we can set it's + * numChars field. + */ + + SetStringFromAny(NULL, newObjPtr); + stringPtr = GET_STRING(newObjPtr); + stringPtr->numChars = last-first+1; + stringPtr->range = Range_Ascii; + } else if ((stringPtr->storageType == Storage_Byte) && + (stringPtr->range == Range_Byte)) { + newObjPtr = Tcl_NewByteArrayObj(stringPtr->data.bytes + first, + last-first+1); + } else { + if (stringPtr->storageType != Storage_Unicode) { + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + newObjPtr = Tcl_NewUnicodeObj(stringPtr->data.unicode + first, + last-first+1); + } + return newObjPtr; } /* *---------------------------------------------------------------------- * @@ -783,11 +1193,11 @@ * None. * * Side effects: * If the size of objPtr's string representation is greater than length, * then it is reduced to length and a new terminating null byte is stored - * in the strength. If the length of the string representation is greater + * in the string. If the length of the string representation is greater * than length, the storage space is reallocated to the given length; a * null byte is stored at the end, but other bytes past the end of the * original string representation are undefined. The object's internal * representation is changed to "expendable string". * @@ -819,63 +1229,44 @@ if (objPtr->bytes && objPtr->length == length) { return; } - SetStringFromAny(NULL, objPtr); - stringPtr = GET_STRING(objPtr); + if (objPtr->bytes == NULL) { + UpdateStringOfString(objPtr); + } - if (objPtr->bytes != NULL) { + if (length > (int) stringPtr->allocated) { /* * Change length of an existing string rep. */ - if (length > stringPtr->allocated) { - /* - * Need to enlarge the buffer. - */ - if (objPtr->bytes == tclEmptyStringRep) { - objPtr->bytes = ckalloc(length + 1); - } else { - objPtr->bytes = ckrealloc(objPtr->bytes, length + 1); - } - stringPtr->allocated = length; - } - - objPtr->length = length; - objPtr->bytes[length] = 0; - - /* - * Invalidate the unicode data. - */ - - stringPtr->numChars = -1; - stringPtr->hasUnicode = 0; - } else { - /* - * Changing length of pure unicode string. - */ - - stringCheckLimits(length); - if (length > stringPtr->maxChars) { - stringPtr = stringRealloc(stringPtr, length); - SET_STRING(objPtr, stringPtr); - stringPtr->maxChars = length; - } - - /* - * Mark the new end of the unicode string - */ - - stringPtr->numChars = length; - stringPtr->unicode[length] = 0; - stringPtr->hasUnicode = 1; - + + if (objPtr->bytes != tclEmptyStringRep) { + objPtr->bytes = ckrealloc((char *)objPtr->bytes, + (unsigned)(length+1)); + } else { + objPtr->bytes = ckalloc((unsigned) (length+1)); + } + stringPtr->allocated = length; + } + + objPtr->length = length; + if (objPtr->bytes != tclEmptyStringRep) { /* * Can only get here when objPtr->bytes == NULL. No need to invalidate * the string rep. */ + + objPtr->bytes[length] = 0; } + + /* + * Invalidate the unicode data. + */ + + stringPtr->numChars = -1; + stringPtr->storageType = Storage_None; } /* *---------------------------------------------------------------------- * @@ -925,11 +1316,17 @@ } SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (objPtr->bytes != NULL) { + if (objPtr->bytes == NULL) { + UpdateStringOfString(objPtr); + } + + if (length > (int) stringPtr->allocated) { + char *new; + /* * Change length of an existing string rep. */ if (length > stringPtr->allocated) { /* @@ -936,61 +1333,41 @@ * Need to enlarge the buffer. */ char *newBytes; - if (objPtr->bytes == tclEmptyStringRep) { - newBytes = attemptckalloc(length + 1); - } else { - newBytes = attemptckrealloc(objPtr->bytes, length + 1); - } - if (newBytes == NULL) { - return 0; - } - objPtr->bytes = newBytes; - stringPtr->allocated = length; - } - - objPtr->length = length; - objPtr->bytes[length] = 0; - - /* - * Invalidate the unicode data. - */ - - stringPtr->numChars = -1; - stringPtr->hasUnicode = 0; - } else { - /* - * Changing length of pure unicode string. - */ - - if (length > STRING_MAXCHARS) { - return 0; - } - if (length > stringPtr->maxChars) { - stringPtr = stringAttemptRealloc(stringPtr, length); - if (stringPtr == NULL) { - return 0; - } - SET_STRING(objPtr, stringPtr); - stringPtr->maxChars = length; - } - - /* - * Mark the new end of the unicode string. - */ - - stringPtr->unicode[length] = 0; - stringPtr->numChars = length; - stringPtr->hasUnicode = 1; - - /* - * Can only get here when objPtr->bytes == NULL. No need to invalidate - * the string rep. - */ - } + if (objPtr->bytes != tclEmptyStringRep) { + new = attemptckrealloc(objPtr->bytes, (unsigned)(length+1)); + if (new == NULL) { + return 0; + } + } else { + new = attemptckalloc((unsigned) (length+1)); + if (new == NULL) { + return 0; + } + } + objPtr->bytes = new; + stringPtr->allocated = length; + } + + objPtr->length = length; + if (objPtr->bytes != tclEmptyStringRep) { + /* + * Ensure the string is NULL-terminated. + */ + + objPtr->bytes[length] = 0; + } + + /* + * Invalidate the unicode data. + */ + + stringPtr->numChars = -1; + stringPtr->storageType = Storage_None; + return 1; } /* *--------------------------------------------------------------------------- @@ -1059,15 +1436,17 @@ stringCheckLimits(numChars); stringPtr = stringAlloc(numChars); SET_STRING(objPtr, stringPtr); objPtr->typePtr = &tclStringType; - stringPtr->maxChars = numChars; - memcpy(stringPtr->unicode, unicode, numChars * sizeof(Tcl_UniChar)); - stringPtr->unicode[numChars] = 0; stringPtr->numChars = numChars; - stringPtr->hasUnicode = 1; + stringPtr->uallocated = uallocated; + stringPtr->storageType = Storage_Unicode; + stringPtr->range = Range_Unicode; + stringPtr->allocated = 0; + memcpy(stringPtr->data.unicode, unicode, uallocated); + stringPtr->data.unicode[numChars] = 0; TclInvalidateStringRep(objPtr); stringPtr->allocated = 0; } @@ -1132,12 +1511,11 @@ * objPtr's string rep. */ SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - - if (stringPtr->hasUnicode && stringPtr->numChars > 0) { + if (stringPtr->storageType == Storage_Unicode) { AppendUtfToUnicodeRep(objPtr, bytes, toCopy); } else { AppendUtfToUtfRep(objPtr, bytes, toCopy); } @@ -1144,12 +1522,12 @@ if (length <= limit) { return; } stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode && stringPtr->numChars > 0) { - AppendUtfToUnicodeRep(objPtr, ellipsis, strlen(ellipsis)); + if (stringPtr->storageType == Storage_Unicode) { + AppendUtfToUnicodeRep(objPtr, ellipsis, -1); } else { AppendUtfToUtfRep(objPtr, ellipsis, strlen(ellipsis)); } } @@ -1223,15 +1601,11 @@ * If objPtr has a valid Unicode rep, then append the "unicode" to the * objPtr's Unicode rep, otherwise the UTF conversion of "unicode" to * objPtr's string rep. */ - if (stringPtr->hasUnicode -#if COMPAT - && stringPtr->numChars > 0 -#endif - ) { + if (stringPtr->storageType == Storage_Unicode) { AppendUnicodeToUnicodeRep(objPtr, unicode, length); } else { AppendUnicodeToUtfRep(objPtr, unicode, length); } } @@ -1311,24 +1685,29 @@ /* * If objPtr has a valid Unicode rep, then get a Unicode string from * appendObjPtr and append it. */ - if (stringPtr->hasUnicode -#if COMPAT - && stringPtr->numChars > 0 -#endif - ) { + stringPtr = GET_STRING(objPtr); + if (stringPtr->storageType == Storage_Unicode) { /* * If appendObjPtr is not of the "String" type, don't convert it. */ if (appendObjPtr->typePtr == &tclStringType) { - Tcl_UniChar *unicode = - Tcl_GetUnicodeFromObj(appendObjPtr, &numChars); + stringPtr = GET_STRING(appendObjPtr); + if (stringPtr->storageType != Storage_Unicode) { + /* + * If appendObjPtr is a string obj with no valid Unicode rep, + * then fill its unicode rep. + */ - AppendUnicodeToUnicodeRep(objPtr, unicode, numChars); + FillUnicodeRep(appendObjPtr); + stringPtr = GET_STRING(appendObjPtr); + } + AppendUnicodeToUnicodeRep(objPtr, stringPtr->data.unicode, + stringPtr->numChars); } else { bytes = TclGetStringFromObj(appendObjPtr, &length); AppendUtfToUnicodeRep(objPtr, bytes, length); } return; @@ -1338,10 +1717,11 @@ * Append to objPtr's UTF string rep. If we know the number of characters * in both objects before appending, then set the combined number of * characters in the final (appended-to) object. */ + Tcl_GetString(objPtr); bytes = TclGetStringFromObj(appendObjPtr, &length); numChars = stringPtr->numChars; if ((numChars >= 0) && (appendObjPtr->typePtr == &tclStringType)) { String *appendStringPtr = GET_STRING(appendObjPtr); @@ -1348,16 +1728,14 @@ appendNumChars = appendStringPtr->numChars; } AppendUtfToUtfRep(objPtr, bytes, length); - if (numChars >= 0 && appendNumChars >= 0 -#if COMPAT - && appendNumChars == length -#endif - ) { - stringPtr->numChars = numChars + appendNumChars; + if (allOneByteChars) { + stringPtr = GET_STRING(objPtr); + stringPtr->numChars = numChars; + stringPtr->range = Range_Ascii; } } /* *---------------------------------------------------------------------- @@ -1435,15 +1813,15 @@ /* * Copy the new string onto the end of the old string, then add the * trailing null. */ - memcpy(stringPtr->unicode + stringPtr->numChars, unicode, + memcpy(stringPtr->data.unicode + stringPtr->numChars, unicode, appendNumChars * sizeof(Tcl_UniChar)); - stringPtr->unicode[numChars] = 0; + stringPtr->data.unicode[numChars] = 0; stringPtr->numChars = numChars; - stringPtr->allocated = 0; + stringPtr->range = Range_Unicode; TclInvalidateStringRep(objPtr); } /* @@ -1601,11 +1979,11 @@ /* * Invalidate the unicode data. */ stringPtr->numChars = -1; - stringPtr->hasUnicode = 0; + stringPtr->storageType = Storage_None; memcpy(objPtr->bytes + oldLength, bytes, numBytes); objPtr->bytes[newLength] = 0; objPtr->length = newLength; } @@ -2633,10 +3011,107 @@ AppendPrintfToObjVA(objPtr, format, argList); va_end(argList); return objPtr; } +int +TclStringCompare( + Tcl_Obj *obj1Ptr, + Tcl_Obj *obj2Ptr, + int reqlength, + int nocase, + int equal) +{ + /* + * Remember to keep code here in some sync with the byte-compiled + * versions in tclExecute.c (INST_STR_EQ, INST_STR_NEQ and + * INST_STR_CMP as well as the expr string comparison in + * INST_EQ/INST_NEQ/INST_LT/...). + */ + + int match, length; + typedef int (*strCmpFn_t)(const char *, const char *, unsigned int); + strCmpFn_t strCmpFn; + char *string1, *string2; + int length1, length2; + + if ((reqlength == 0) || (obj1Ptr == obj2Ptr)) { + /* + * Always match at 0 chars of if it is the same obj. + */ + + return 0; + } else if (!nocase && obj1Ptr->typePtr == &tclByteArrayType && + obj2Ptr->typePtr == &tclByteArrayType) { + /* + * Use binary versions of comparisons since that won't cause undue + * type conversions and it is much faster. Only do this if we're + * case-sensitive (which is all that really makes sense with byte + * arrays anyway, and we have no memcasecmp() for some + * reason... :^) + */ + + string1 = (char *) Tcl_GetByteArrayFromObj(obj1Ptr, &length1); + string2 = (char *) Tcl_GetByteArrayFromObj(obj2Ptr, &length2); + strCmpFn = (strCmpFn_t) memcmp; + } else if ((obj1Ptr->typePtr == &tclStringType) + && (obj2Ptr->typePtr == &tclStringType)) { + /* + * Do a unicode-specific comparison if both of the args are of + * String type. In benchmark testing this proved the most + * efficient check between the unicode and string comparison + * operations. + */ + + string1 = (char *) Tcl_GetUnicodeFromObj(obj1Ptr, &length1); + string2 = (char *) Tcl_GetUnicodeFromObj(obj2Ptr, &length2); + strCmpFn = (strCmpFn_t) + (nocase ? Tcl_UniCharNcasecmp : Tcl_UniCharNcmp); + } else { + /* + * As a catch-all we will work with UTF-8. We cannot use memcmp() + * as that is unsafe with any string containing NULL (\xC0\x80 in + * Tcl's utf rep). We can use the more efficient TclpUtfNcmp2 if + * we are case-sensitive and no specific length was requested. + */ + + string1 = (char *) Tcl_GetStringFromObj(obj1Ptr, &length1); + string2 = (char *) Tcl_GetStringFromObj(obj2Ptr, &length2); + if ((reqlength < 0) && !nocase) { + strCmpFn = (strCmpFn_t) TclpUtfNcmp2; + } else { + length1 = Tcl_NumUtfChars(string1, length1); + length2 = Tcl_NumUtfChars(string2, length2); + strCmpFn = (strCmpFn_t) + (nocase ? Tcl_UtfNcasecmp : Tcl_UtfNcmp); + } + } + + if (equal && (reqlength < 0) && (length1 != length2)) { + return 1; + } + + length = (length1 < length2) ? length1 : length2; + if (reqlength > 0 && reqlength < length) { + length = reqlength; + } else if (reqlength < 0) { + /* + * The requested length is negative, so we ignore it by + * setting it to length + 1 so we correct the match var. + */ + + reqlength = length + 1; + } + + match = strCmpFn(string1, string2, (unsigned) length); + if ((match == 0) && (reqlength > length)) { + match = length1 - length2; + } + + return (match > 0) ? 1 : (match < 0) ? -1 : 0; +} + /* *--------------------------------------------------------------------------- * * TclStringObjReverse -- * @@ -2663,13 +3138,33 @@ Tcl_Obj *resultPtr = NULL; SetStringFromAny(NULL, objPtr); stringPtr = GET_STRING(objPtr); - if (stringPtr->hasUnicode == 0) { - if (stringPtr->numChars == -1) { - TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); + /* + * Make sure we have either fixed width UTF data or Unicode data present. + */ + + if ((stringPtr->storageType != Storage_Unicode) && + (stringPtr->range != Range_Ascii)) { + FillUnicodeRep(objPtr); + stringPtr = GET_STRING(objPtr); + } + + if (stringPtr->storageType == Storage_Unicode) { + Tcl_UniChar *source = stringPtr->data.unicode; + + if (Tcl_IsShared(objPtr)) { + Tcl_UniChar *dest; + + Tcl_Obj *resultPtr = Tcl_NewObj(); + dest = TclSetUnicodeLength(resultPtr, numChars); + + while (i < numChars) { + dest[i++] = source[lastCharIdx--]; + } + return resultPtr; } if (stringPtr->numChars <= 1) { return objPtr; } if (stringPtr->numChars == objPtr->length) { @@ -2753,11 +3248,11 @@ *--------------------------------------------------------------------------- * * FillUnicodeRep -- * * Populate the Unicode internal rep with the Unicode form of its string - * rep. The object must alread have a "String" internal rep. + * rep. The object must already have a "String" internal rep. * * Results: * None. * * Side effects: @@ -2769,45 +3264,88 @@ static void FillUnicodeRep( Tcl_Obj *objPtr) /* The object in which to fill the unicode * rep. */ { - String *stringPtr = GET_STRING(objPtr); - - ExtendUnicodeRepWithString(objPtr, objPtr->bytes, objPtr->length, - stringPtr->numChars); -} - -static void -ExtendUnicodeRepWithString( - Tcl_Obj *objPtr, - const char *bytes, - int numBytes, - int numAppendChars) -{ - String *stringPtr = GET_STRING(objPtr); - int needed, numOrigChars = 0; + String *stringPtr; + size_t uallocated; + char *srcEnd, *src = objPtr->bytes; + unsigned char *srcByte; Tcl_UniChar *dst; - if (stringPtr->hasUnicode) { - numOrigChars = stringPtr->numChars; - } - if (numAppendChars == -1) { - TclNumUtfChars(numAppendChars, bytes, numBytes); - } - needed = numOrigChars + numAppendChars; - stringCheckLimits(needed); - - if (needed > stringPtr->maxChars) { - GrowUnicodeBuffer(objPtr, needed); - stringPtr = GET_STRING(objPtr); - } - - stringPtr->hasUnicode = 1; - stringPtr->numChars = needed; - for (dst=stringPtr->unicode + numOrigChars; numAppendChars-- > 0; dst++) { - bytes += TclUtfToUniChar(bytes, dst); + stringPtr = GET_STRING(objPtr); + if (stringPtr->numChars == -1) { + Tcl_GetCharLength(objPtr); + } + if (stringPtr->storageType == Storage_Unicode) { + return; + } + + if (stringPtr->storageType == Storage_Byte) { + /* + * Convert ualloc count to unicode + */ + + stringPtr->uallocated = stringPtr->uallocated - 2; + } + + uallocated = STRING_UALLOC(stringPtr->numChars); + if (uallocated > stringPtr->uallocated) { + /* + * If not enough space has been allocated for the unicode rep, + * reallocate the internal rep object. + * + * There isn't currently enough space in the Unicode representation so + * allocate additional space. If the current Unicode representation + * isn't empty (i.e. it looks like we've done some appends) then + * overallocate the space so that we won't have to do as much + * reallocation in the future. + */ + + if (stringPtr->uallocated > 0) { + uallocated *= 2; + } + stringPtr = stringRealloc(stringPtr, uallocated); + stringPtr->uallocated = uallocated; + SET_STRING(objPtr, stringPtr); + } + + if (stringPtr->storageType == Storage_Byte) { + if (stringPtr->range <= Range_Byte) { + /* + * Convert byte to unicode directly. + */ + + srcByte = stringPtr->data.bytes + stringPtr->numChars - 1; + dst = stringPtr->data.unicode + stringPtr->numChars - 1; + while (srcByte >= stringPtr->data.bytes) { + *dst = (Tcl_UniChar) *srcByte; + dst--; + srcByte--; + } + stringPtr->data.unicode[stringPtr->numChars] = 0; + stringPtr->storageType = Storage_Unicode; + objPtr->typePtr = &tclStringType; + return; + } else { + if (objPtr->bytes == NULL) { + Tcl_GetString(objPtr); + } + } + } + + src = objPtr->bytes; + stringPtr->storageType = Storage_Unicode; + objPtr->typePtr = &tclStringType; + + /* + * Convert src to Unicode and store the coverted data in "unicode". + */ + + srcEnd = src + objPtr->length; + for (dst = stringPtr->data.unicode; src < srcEnd; dst++) { + src += TclUtfToUniChar(src, dst); } *dst = 0; } /* @@ -2883,37 +3421,42 @@ * rep of the source object and create an "empty" Unicode internal rep for * the new object. Otherwise, copy Unicode internal rep, and invalidate * the string rep of the new object. */ - if (srcStringPtr->hasUnicode && srcStringPtr->numChars > 0) { - /* - * Copy the full allocation for the Unicode buffer. - */ - - copyStringPtr = stringAlloc(srcStringPtr->maxChars); - copyStringPtr->maxChars = srcStringPtr->maxChars; - memcpy(copyStringPtr->unicode, srcStringPtr->unicode, - srcStringPtr->numChars * sizeof(Tcl_UniChar)); - copyStringPtr->unicode[srcStringPtr->numChars] = 0; - copyStringPtr->allocated = 0; + if (srcStringPtr->storageType == Storage_None) { + copyStringPtr = (String *) ckalloc(STRING_SIZE(STRING_UALLOC(0))); + copyStringPtr->uallocated = STRING_UALLOC(0); + } else if (srcStringPtr->storageType == Storage_Byte) { + copyStringPtr = (String *) ckalloc( + BYTEARRAY_SIZE(srcStringPtr->uallocated)); + copyStringPtr->uallocated = srcStringPtr->uallocated; + + memcpy((void *) copyStringPtr->data.bytes, + (void *) srcStringPtr->data.bytes, + (size_t) srcStringPtr->numChars); } else { - copyStringPtr = stringAlloc(0); - copyStringPtr->unicode[0] = 0; - copyStringPtr->maxChars = 0; - - /* - * Tricky point: the string value was copied by generic object - * management code, so it doesn't contain any extra bytes that might - * exist in the source object. - */ - - copyStringPtr->allocated = copyPtr->length; + copyStringPtr = stringAlloc(srcStringPtr->uallocated); + copyStringPtr->uallocated = srcStringPtr->uallocated; + + memcpy(copyStringPtr->data.unicode, + srcStringPtr->data.unicode, + (size_t) srcStringPtr->numChars * sizeof(Tcl_UniChar)); + copyStringPtr->data.unicode[srcStringPtr->numChars] = 0; } copyStringPtr->numChars = srcStringPtr->numChars; - copyStringPtr->hasUnicode = srcStringPtr->hasUnicode; -#endif /* COMPAT==0 */ + copyStringPtr->storageType = srcStringPtr->storageType; + copyStringPtr->allocated = srcStringPtr->allocated; + copyStringPtr->range = srcStringPtr->range; + + /* + * Tricky point: the string value was copied by generic object management + * code, so it doesn't contain any extra bytes that might exist in the + * source object. + */ + + copyStringPtr->allocated = copyPtr->length; SET_STRING(copyPtr, copyStringPtr); copyPtr->typePtr = &tclStringType; } @@ -2937,12 +3480,19 @@ static int SetStringFromAny( Tcl_Interp *interp, /* Used for error reporting if not NULL. */ Tcl_Obj *objPtr) /* The object to convert. */ { - if (objPtr->typePtr != &tclStringType) { - String *stringPtr = stringAlloc(0); + /* + * The Unicode object is optimized for the case where each UTF char in a + * string is only one byte. In this case, we store the value of numChars, + * but we don't copy the bytes to the unicodeObj->unicode. + */ + + if ((objPtr->typePtr != &tclStringType) && + (objPtr->typePtr != &tclByteArrayType)) { + String *stringPtr; /* * Convert whatever we have into an untyped value. Just A String. */ @@ -2953,18 +3503,129 @@ * Create a basic String intrep that just points to the UTF-8 string * already in place at objPtr->bytes. */ stringPtr->numChars = -1; - stringPtr->allocated = objPtr->length; - stringPtr->maxChars = 0; - stringPtr->hasUnicode = 0; + stringPtr->uallocated = STRING_UALLOC(0); + stringPtr->storageType = Storage_None; + + if (objPtr->bytes != NULL) { + stringPtr->allocated = objPtr->length; + objPtr->bytes[objPtr->length] = 0; + } else { + objPtr->length = 0; + } SET_STRING(objPtr, stringPtr); objPtr->typePtr = &tclStringType; } return TCL_OK; } + +static int +SetByteArrayFromAny( + Tcl_Interp *interp, /* Used for error reporting if not NULL. */ + register Tcl_Obj *objPtr) /* The object to convert. */ +{ + String *stringPtr; + CharacterRange range = Range_Unicode; + int length, count; + + if (objPtr->typePtr == &tclByteArrayType) { + return TCL_OK; + } + if (objPtr->typePtr != &tclStringType) { + SetStringFromAny(interp, objPtr); + } + stringPtr = GET_STRING(objPtr); + + objPtr->typePtr = &tclByteArrayType; + if (stringPtr->storageType == Storage_Byte) { + return TCL_OK; + } + + length = Tcl_GetCharLength(objPtr); + + if (stringPtr->storageType != Storage_Unicode) { + /* + * Since the unicode representation is larger than the byte rep + * we only need to allocate storage when there is none. + */ + + size_t uallocated = BYTEARRAY_UALLOC(length); + stringPtr = (String *) ckrealloc((char*) stringPtr, + BYTEARRAY_SIZE(uallocated)); + SET_STRING(objPtr, stringPtr); + stringPtr->uallocated = uallocated; + } else { + /* + * Change allocation count from unicode to bytes. + */ + + stringPtr->uallocated = stringPtr->uallocated + 2; + } + + if (stringPtr->range == Range_Ascii) { + memcpy((void *) stringPtr->data.bytes, objPtr->bytes, length); + stringPtr->storageType = Storage_Byte; + return TCL_OK; + } + + if (stringPtr->storageType == Storage_Unicode) { + Tcl_UniChar *srcPtr = stringPtr->data.unicode; + unsigned char *dstPtr = stringPtr->data.bytes; + + /* + * Look for any char outside of byte range. + */ + + count = length; + range = Range_Byte; + while (count-- > 0) { + if (*srcPtr++ > 255) { + range = Range_Unicode; + break; + } + } + + /* + * If a pure Unicode object contains chars outside the byte range we + * must fill in the UTF representation to not lose information. + */ + + if (range == Range_Unicode) { + Tcl_GetString(objPtr); + } + + count = length; + srcPtr = stringPtr->data.unicode; + while (count-- > 0) { + *dstPtr = (unsigned char) *srcPtr; + dstPtr++; + srcPtr++; + } + + stringPtr->storageType = Storage_Byte; + stringPtr->range = range; + } else { + Tcl_UniChar ch; + char *srcPtr = Tcl_GetStringFromObj(objPtr, &length); + unsigned char *dstPtr = stringPtr->data.bytes; + char *srcEnd = srcPtr + length; + + range = Range_Byte; + while (srcPtr < srcEnd) { + srcPtr += Tcl_UtfToUniChar(srcPtr, &ch); + *dstPtr++ = (unsigned char) ch; + if (ch > 255) { + range = Range_Unicode; + } + } + stringPtr->storageType = Storage_Byte; + stringPtr->range = range; + } + return TCL_OK; +} /* *---------------------------------------------------------------------- * * UpdateStringOfString -- @@ -2984,69 +3645,76 @@ static void UpdateStringOfString( Tcl_Obj *objPtr) /* Object with string rep to update. */ { - String *stringPtr = GET_STRING(objPtr); - - if (stringPtr->numChars == 0) { - TclInitStringRep(objPtr, tclEmptyStringRep, 0); - } else { - (void) ExtendStringRepWithUnicode(objPtr, stringPtr->unicode, - stringPtr->numChars); - } -} - -static int -ExtendStringRepWithUnicode( - Tcl_Obj *objPtr, - const Tcl_UniChar *unicode, - int numChars) -{ - /* - * Pre-condition: this is the "string" Tcl_ObjType. - */ - - int i, origLength, size = 0; - char *dst, buf[TCL_UTF_MAX]; - String *stringPtr = GET_STRING(objPtr); - - if (numChars < 0) { - numChars = UnicodeLength(unicode); - } - - if (numChars == 0) { - return 0; - } - - if (objPtr->bytes == NULL) { - objPtr->length = 0; - } - size = origLength = objPtr->length; - - /* - * Quick cheap check in case we have more than enough room. - */ - - if (numChars <= (INT_MAX - size)/TCL_UTF_MAX - && stringPtr->allocated >= size + numChars * TCL_UTF_MAX) { - goto copyBytes; - } - - for (i = 0; i < numChars && size >= 0; i++) { - size += Tcl_UniCharToUtf((int) unicode[i], buf); - } - if (size < 0) { - Tcl_Panic("max size for a Tcl value (%d bytes) exceeded", INT_MAX); - } - - /* - * Grow space if needed. - */ - - if (size > stringPtr->allocated) { - GrowStringBuffer(objPtr, size, 1); + int i, size; + Tcl_UniChar *unicode; + char dummy[TCL_UTF_MAX]; + unsigned char *src; + char *dst; + String *stringPtr; + + stringPtr = GET_STRING(objPtr); + if ((objPtr->bytes == NULL) || (stringPtr->allocated == 0)) { + if (stringPtr->storageType == Storage_None) { + /* + * If there is no Unicode rep, or the string has 0 chars, then set + * the string rep to an empty string. + */ + String *stringPtr = GET_STRING(objPtr); + + if (stringPtr->storageType == Storage_Byte) { + /* + * How much space will string rep need? + */ + + size = stringPtr->numChars; + src = stringPtr->data.bytes; + for (i = 0; i < stringPtr->numChars; i++) { + if ((src[i] == 0) || (src[i] > 127)) { + size++; + } + } + + dst = (char *) ckalloc((unsigned) (size + 1)); + objPtr->bytes = dst; + objPtr->length = size; + stringPtr->allocated = size; + + if (size == stringPtr->numChars) { + memcpy(dst, src, (size_t) size); + dst[size] = '\0'; + } else { + for (i = 0; i < stringPtr->numChars; i++) { + dst += Tcl_UniCharToUtf(src[i], dst); + } + *dst = '\0'; + } + } else { + unicode = stringPtr->data.unicode; + + /* + * Translate the Unicode string to UTF. "size" will hold the amount + * of space the UTF string needs. + */ + + size = 0; + for (i = 0; i < stringPtr->numChars; i++) { + size += Tcl_UniCharToUtf((int) unicode[i], dummy); + } + + dst = (char *) ckalloc((unsigned) (size + 1)); + objPtr->bytes = dst; + objPtr->length = size; + stringPtr->allocated = size; + + for (i = 0; i < stringPtr->numChars; i++) { + dst += Tcl_UniCharToUtf(unicode[i], dst); + } + *dst = '\0'; + } } copyBytes: dst = objPtr->bytes + origLength; for (i = 0; i < numChars; i++) { Index: generic/tclTestObj.c ================================================================== --- generic/tclTestObj.c +++ generic/tclTestObj.c @@ -55,12 +55,14 @@ static int TeststringobjCmd(ClientData dummy, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[]); typedef struct TestString { int numChars; - int allocated; - int maxChars; + size_t allocated; + size_t uallocated; + int range; + int storageType; Tcl_UniChar unicode[2]; } TestString; /* *---------------------------------------------------------------------- @@ -1026,11 +1028,25 @@ return TCL_ERROR; } if (CheckIfVarUnset(interp, varIndex)) { return TCL_ERROR; } - Tcl_SetObjResult(interp, Tcl_NewIntObj(varPtr[varIndex]->refCount)); + TclFormatInt(buf, varPtr[varIndex]->refCount); + Tcl_SetResult(interp, buf, TCL_VOLATILE); + } else if (strcmp(subCmd, "pure") == 0) { + if (objc != 3) { + goto wrongNumArgs; + } + index = Tcl_GetString(objv[2]); + if (GetVariableIndex(interp, index, &varIndex) != TCL_OK) { + return TCL_ERROR; + } + if (CheckIfVarUnset(interp, varIndex)) { + return TCL_ERROR; + } + Tcl_SetObjResult(interp, + Tcl_NewBooleanObj(varPtr[varIndex]->bytes == NULL)); } else if (strcmp(subCmd, "type") == 0) { if (objc != 3) { goto wrongNumArgs; } index = Tcl_GetString(objv[2]); @@ -1092,14 +1108,16 @@ Tcl_UniChar *unicode; int varIndex, option, i, length; #define MAX_STRINGS 11 const char *index, *string, *strings[MAX_STRINGS+1]; TestString *strPtr; - static const char *const options[] = { + Tcl_UniChar uniChar, *uniPtr; + Tcl_Obj *objPtr; + static const char *options[] = { "append", "appendstrings", "get", "get2", "length", "length2", - "set", "set2", "setlength", "maxchars", "getunicode", - "appendself", "appendself2", NULL + "set", "set2", "setlength", "ualloc", "getunicode", "range", "getunichar", + "setunicode", "appendunicode", NULL }; if (objc < 3) { wrongNumArgs: Tcl_WrongNumArgs(interp, 1, objv, "option arg ?arg ...?"); @@ -1265,70 +1283,52 @@ if (objc != 3) { goto wrongNumArgs; } Tcl_GetUnicodeFromObj(varPtr[varIndex], NULL); break; - case 11: /* appendself */ + case 11: /* range */ + if (objc != 3) { + goto wrongNumArgs; + } + if (varPtr[varIndex] != NULL) { + strPtr = (TestString *) + (varPtr[varIndex])->internalRep.otherValuePtr; + length = (int) strPtr->range; + } else { + length = -1; + } + Tcl_SetIntObj(Tcl_GetObjResult(interp), length); + break; + case 12: /* getunichar */ if (objc != 4) { goto wrongNumArgs; } - if (varPtr[varIndex] == NULL) { - SetVarToObj(varIndex, Tcl_NewObj()); - } - - /* - * If the object bound to variable "varIndex" is shared, we must - * "copy on write" and append to a copy of the object. - */ - - if (Tcl_IsShared(varPtr[varIndex])) { - SetVarToObj(varIndex, Tcl_DuplicateObj(varPtr[varIndex])); - } - - string = Tcl_GetStringFromObj(varPtr[varIndex], &length); - if (Tcl_GetIntFromObj(interp, objv[3], &i) != TCL_OK) { return TCL_ERROR; } - if ((i < 0) || (i > length)) { - Tcl_SetObjResult(interp, Tcl_NewStringObj( - "index value out of range", -1)); - return TCL_ERROR; - } - - Tcl_AppendToObj(varPtr[varIndex], string + i, length - i); - Tcl_SetObjResult(interp, varPtr[varIndex]); - break; - case 12: /* appendself2 */ + uniChar = Tcl_GetUniChar(varPtr[varIndex], i); + Tcl_SetObjResult(interp, Tcl_NewUnicodeObj(&uniChar, 1)); + break; + case 13: /* setunicode */ + if (objc != 4) { + goto wrongNumArgs; + } + uniPtr = Tcl_GetUnicodeFromObj(objv[3], &length); + objPtr = Tcl_NewObj(); + Tcl_SetUnicodeObj(objPtr, uniPtr, length); + SetVarToObj(varIndex, objPtr); + break; + case 14: /* appendunicode */ if (objc != 4) { goto wrongNumArgs; } if (varPtr[varIndex] == NULL) { SetVarToObj(varIndex, Tcl_NewObj()); } - - /* - * If the object bound to variable "varIndex" is shared, we must - * "copy on write" and append to a copy of the object. - */ - - if (Tcl_IsShared(varPtr[varIndex])) { - SetVarToObj(varIndex, Tcl_DuplicateObj(varPtr[varIndex])); - } - - unicode = Tcl_GetUnicodeFromObj(varPtr[varIndex], &length); - - if (Tcl_GetIntFromObj(interp, objv[3], &i) != TCL_OK) { - return TCL_ERROR; - } - if ((i < 0) || (i > length)) { - Tcl_SetObjResult(interp, Tcl_NewStringObj( - "index value out of range", -1)); - return TCL_ERROR; - } - - Tcl_AppendUnicodeToObj(varPtr[varIndex], unicode + i, length - i); + uniPtr = Tcl_GetUnicodeFromObj(objv[3], &length); + objPtr = Tcl_NewObj(); + Tcl_AppendUnicodeToObj(varPtr[varIndex], uniPtr, length); Tcl_SetObjResult(interp, varPtr[varIndex]); break; } return TCL_OK; ADDED stringbyte.txt Index: stringbyte.txt ================================================================== --- /dev/null +++ stringbyte.txt @@ -0,0 +1,94 @@ +This is the original write down of the Unify idea, from Sep 2007... + + +Currently there are two internal reps called "string" and "bytearray". +They share some common properties including storing the length of the +string and a fixed-sized-per-char array allowing O(1) indexing. + +Currently any place that handles strings, such as indexing, ranging, +comparing etc, needs to handle both intreps to avoid shimmering +and preserve efficiency. + +Unifying these into one intrep might help simplifying the code +and maybe make things more efficient. +It will also make shimmering more efficient when needed. + +Properties of "string": + Stores length in characters. + Stores info about extra allocated utf-8 space. + Optionally stores 16-bit unicode array, handling extra space. + Knows if utf-8 is one-byte-per-char. + +Properties of "bytearray": + Stores length in characters. + Stores 8-bit byte array, handling extra space. + +A current bug in bytearray is that it does not handle if a char +outside of 0-255 is present in the utf-8 string, i.e. if indexing +into the bytearray is really valid. This does not matter in any +sane code, but to be picky it is a bug. + + +Does a unified version need the ability to keep both a byte and a +unicode rep? +If not, the data can be in a union at the end of the struct keeping +the entire intrep in one block as the current intreps do. +If yes, the intrep could need three memory blocks, the struct, the +byte data and the unicode data. This would make handling generally +trickier. +I would say no to this currently. + + +A combined structure could contain: + The length in characters. + Size of utf-8 allocation. + If utf-8 is one-byte-per-char (all chars within 1-127). + If all chars are within 0-255 (pure bytearray). + If stored array is not present, byte or unicode. + Size of array allocation. + Data Union + +The property "utf-8 is one-byte-per-char" is currently +indicated by numChars != -1 and hasUnicode == 0. +A special field for this combined with a value for a pure bytearray +covers all needs. This could be expanded in the future if +"Unicode" is split into UTF-16 and real Unicode. + +typedef enum { + Ascii, Byte, Unicode +} CharacterRange; + +The presence of data can be indicated with an enum. This +can also be expanded if UTF-16/Unicode needs to be separated. + +typedef enum { + None, Byte, Unicode +} StorageType; + +Giving something like this: + +typedef struct String { + int numChars; + size_t allocated; + CharacterRange range; + StorageType storageType; + size_t allocatedStorage; + union { + unsigned char bytes[4]; + Tcl_UniChar unicode[2]; + } data; +} String; + +This only expands the String array with one integer. + + +Implementation: +All ByteArray APIs move to tclStringObj.c +The binary command stays in tclBinary.c + +Shimmering from unicode to byte is a rather cheap operation +since it can be done in-place. + +Compatibility: +If anyone accesses through e.g. Tcl_GetObjType("bytearray") it gets +tricker to maintain full compatiblity. I wonder if this is needed? Index: tests/stringObj.test ================================================================== --- tests/stringObj.test +++ tests/stringObj.test @@ -395,10 +395,28 @@ list [string index $x 4] [string index $x 0] } "\u00ae \u00ef" test stringObj-12.6 {Tcl_GetUniChar} testobj { string index "\u00efa\u00bfb\u00aec\u00ef\u00bfd\u00ae" end } "\u00ae" + +test stringObj-12.7 {Tcl_GetUniChar with byte-size chars} testobj { + teststringobj set2 1 "abcdefghi" + list [teststringobj getunichar 1 0] [teststringobj getunichar 1 1] +} {a b} + +test stringObj-12.8 {Tcl_GetUniChar with unicode rep} testobj { + set a "abcd\u129fefghi" + teststringobj set2 1 $a + list [teststringobj getunichar 1 4] [teststringobj getunichar 1 6] +} [list \u129f f] + +test stringObj-12.9 {Tcl_GetUniChar with byte rep} testobj { + set a "abcd\u009fefghi" + binary scan $a I dummy + teststringobj set2 1 $a + list [teststringobj getunichar 1 4] [teststringobj getunichar 1 6] +} [list \u009f f] test stringObj-13.1 {Tcl_GetCharLength with byte-size chars} testobj { set a "" list [string length $a] [string length $a] } {0 0} @@ -475,13 +493,141 @@ test stringObj-15.8 {Tcl_Append*ToObj: self appends} { teststringobj set 1 foo teststringobj appendself2 1 3 } foo +test stringObj-15.1 {byte/unicode shimmering, start from byte} testobj { + set res "" + # Pure byte object + teststringobj set2 1 [binary format c* {1 5 8 9 78 178 250 180}] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Still byte object + string length [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Unicode object + regexp {.*} [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Byte object + binary scan [teststringobj get 1] I dummy + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Unicode object + regexp {.*} [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + +} {bytearray 8 1 bytearray 8 1 string 32 1 bytearray 34 1 string 32 1} + +test stringObj-15.2 {byte/unicode shimmering, start from unicode} testobj { + set res "" + # Pure unicode object + teststringobj set 1 a\u0095cdef\u00EFh + teststringobj getunicode 1 + testobj invalidateStringRep 1 + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Still unicode object + string length [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Byte object + binary scan [teststringobj get 1] I dummy + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Unicode object + regexp {.*} [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Byte object + binary scan [teststringobj get 1] I dummy + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + +} {string 16 1 string 16 1 bytearray 18 1 string 16 1 bytearray 18 1} + +test stringObj-15.3 {byte/unicode shimmering, out of byte range} testobj { + set res "" + # Pure unicode object + teststringobj set 1 a\u1295cdef\u00EFh + teststringobj getunicode 1 + testobj invalidateStringRep 1 + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + lappend res [string range [teststringobj get 1] 0 end] + + # Byte object + binary scan [teststringobj get 1] I dummy + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Reading byte object with out-of-range data + lappend res [string range [teststringobj get 1] 0 end] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Byte object, without utf + binary scan [teststringobj get 1] I dummy + testobj invalidateStringRep 1 + teststringobj getunicode 1 + lappend res [string range [teststringobj get 1] 0 end] + +} [list string 16 1 a\u1295cdef\u00EFh bytearray 18 0 a\u1295cdef\u00EFh string 16 0 a\u0095cdef\u00EFh] + +test stringObj-15.4 {byte/unicode shimmering, one-char} testobj { + set res "" + # Pure byte object + teststringobj set2 1 [binary format c 178] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] + + # Unicode object + regexp {.*} [teststringobj get 1] + lappend res [testobj type 1] + lappend res [teststringobj ualloc 1] + lappend res [testobj pure 1] +} {bytearray 4 1 string 2 1} + +test stringObj-16.1 {Tcl_SetUnicodeObj} testobj { + teststringobj setunicode 1 foo\u1234bar + testobj type 1 +} {string} + +test stringObj-17.1 {Tcl_AppendUnicodeToObj} testobj { + set res "" + teststringobj set 1 apa + teststringobj appendunicode 1 foo\u1234bar + lappend res [testobj type 1] + lappend res [teststringobj get 1] +} [list string apafoo\u1234bar] if {[testConstraint testobj]} { testobj freeallvars } # cleanup ::tcltest::cleanupTests return