/*
******************************************************************************
*
-* Copyright (C) 1999-2007, International Business Machines
+* Copyright (C) 1999-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
+/*
+ * This struct was replaced by explicitly accessing equivalent
+ * fields from triples of uint16_t.
+ * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
+ * which broke the assumption that sizeof(Group)==6 and that the ++ operator
+ * would advance by 6 bytes (3 uint16_t).
+ *
+ * We can't just change the data structure because it's loaded from a data file,
+ * and we don't want to make it less compact, so we changed the access code.
+ *
+ * For details see ICU tickets 6331 and 6008.
typedef struct {
uint16_t groupMSB,
- offsetHigh, offsetLow; /* avoid padding */
+ offsetHigh, offsetLow; / * avoid padding * /
} Group;
+ */
+enum { /* indexes of the fields inside one group, stored as consecutive uint16_t values */
+ GROUP_MSB, /* index of the value that is compared with code>>GROUP_SHIFT */
+ GROUP_OFFSET_HIGH, /* index of the upper 16 bits of the group offset */
+ GROUP_OFFSET_LOW, /* index of the lower 16 bits of the group offset */
+ GROUP_LENGTH /* number of uint16_t values per group; used to step between groups */
+};
+
+/*
+ * Get the 32-bit group offset.
+ * The high half is widened to uint32_t before it is shifted so that a value
+ * with bit 15 set is never shifted into the sign bit of a signed integer;
+ * the combined value is then converted to int32_t.
+ * The argument is evaluated twice, so it must not have side effects.
+ *
+ * @param group (const uint16_t *) pointer to a Group triple of uint16_t
+ * @return group offset (int32_t)
+ */
+#define GET_GROUP_OFFSET(group) ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))
+
+#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) /* pointer GROUP_LENGTH uint16_t values after group */
+#define PREV_GROUP(group) ((group)-GROUP_LENGTH) /* pointer GROUP_LENGTH uint16_t values before group */
typedef struct {
uint32_t start, end;
uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;
+/*
+ * Get the groups table from a UCharNames struct.
+ * The groups table consists of one uint16_t groupCount followed by
+ * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
+ * and the comment for the old struct Group above.
+ *
+ * The argument and the whole expansion are parenthesized so that any
+ * pointer expression can be passed and the result can be indexed or
+ * dereferenced directly. The argument is evaluated twice, so it must
+ * not have side effects.
+ *
+ * @param names (const UCharNames *) pointer to the UCharNames indexes
+ * @return (const uint16_t *) pointer to the groups table
+ */
+#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
+
typedef struct {
const char *otherName;
UChar32 code;
{
umtx_lock(NULL);
if(uCharNames==NULL) {
- uCharNames=names;
uCharNamesData=data;
+ uCharNames=names;
data=NULL;
names=NULL;
ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
* field can contain ';' as part of its contents.
* In unames.dat, it is marked as token[';']==-1 only if the
* semicolon is used in the data file - which is iff we
- * have Unicode 1.0 names or ISO comments.
- * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
+ * have Unicode 1.0 names or ISO comments or aliases.
+ * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
* although we know that it will never be part of a name.
*/
static uint16_t
uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
uint8_t c;
- if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==U_ISO_COMMENT) {
+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
/*
* skip the modern name if it is not requested _and_
* if the semicolon byte value is a character, not a token number
*/
if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
- }
- }
- if(nameChoice==U_ISO_COMMENT) {
- /* skip the Unicode 1.0 name as well to get the ISO comment */
+ int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
+ do {
while(nameLength>0) {
--nameLength;
if(*name++==';') {
break;
}
}
- }
+ } while(--fieldIndex>0);
} else {
/*
* the semicolon byte value is a token number, therefore
* only modern names are stored in unames.dat and there is no
- * such requested Unicode 1.0 name here
+ * such requested alternate name here
*/
nameLength=0;
}
uint8_t c;
const char *origOtherName = otherName;
- if(nameChoice==U_UNICODE_10_CHAR_NAME) {
+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
/*
* skip the modern name if it is not requested _and_
* if the semicolon byte value is a character, not a token number
*/
if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
- while(nameLength>0) {
- --nameLength;
- if(*name++==';') {
- break;
+ int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
+ do {
+ while(nameLength>0) {
+ --nameLength;
+ if(*name++==';') {
+ break;
+ }
}
- }
+ } while(--fieldIndex>0);
} else {
/*
* the semicolon byte value is a token number, therefore
* only modern names are stored in unames.dat and there is no
- * such requested Unicode 1.0 name here
+ * such requested alternate name here
*/
nameLength=0;
}
* or else is the highest group before "code".
* If the lowest group is after "code", then that one is returned.
*/
-static Group *
+static const uint16_t *
getGroup(UCharNames *names, uint32_t code) {
+ const uint16_t *groups=GET_GROUPS(names);
uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
start=0,
- limit=*(uint16_t *)((char *)names+names->groupsOffset),
+ limit=*groups++,
number;
- Group *groups=(Group *)((char *)names+names->groupsOffset+2);
/* binary search for the group of names that contains the one for code */
while(start<limit-1) {
number=(uint16_t)((start+limit)/2);
- if(groupMSB<groups[number].groupMSB) {
+ if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
limit=number;
} else {
start=number;
}
/* return this regardless of whether it is an exact match */
- return groups+start;
+ return groups+start*GROUP_LENGTH;
}
/*
}
static uint16_t
-expandGroupName(UCharNames *names, Group *group,
+expandGroupName(UCharNames *names, const uint16_t *group,
uint16_t lineNumber, UCharNameChoice nameChoice,
char *buffer, uint16_t bufferLength) {
uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
- (group->offsetHigh<<16|group->offsetLow);
+ const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
s=expandGroupLengths(s, offsets, lengths);
return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
buffer, bufferLength);
static uint16_t
getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
char *buffer, uint16_t bufferLength) {
- Group *group=getGroup(names, code);
- if((uint16_t)(code>>GROUP_SHIFT)==group->groupMSB) {
+ const uint16_t *group=getGroup(names, code);
+ if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
buffer, bufferLength);
} else {
* and either calls the enumerator function or finds a given input name.
*/
static UBool
-enumGroupNames(UCharNames *names, Group *group,
+enumGroupNames(UCharNames *names, const uint16_t *group,
UChar32 start, UChar32 end,
UEnumCharNamesFn *fn, void *context,
UCharNameChoice nameChoice) {
uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
- const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
- (group->offsetHigh<<16|group->offsetLow);
+ const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
s=expandGroupLengths(s, offsets, lengths);
if(fn!=DO_FIND_NAME) {
UEnumCharNamesFn *fn, void *context,
UCharNameChoice nameChoice) {
uint16_t startGroupMSB, endGroupMSB, groupCount;
- Group *group, *groupLimit;
+ const uint16_t *group, *groupLimit;
startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
group=getGroup(names, start);
if(startGroupMSB==endGroupMSB) {
- if(startGroupMSB==group->groupMSB) {
+ if(startGroupMSB==group[GROUP_MSB]) {
/* if start and limit-1 are in the same group, then enumerate only in that one */
return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
}
} else {
- groupCount=*(uint16_t *)((char *)names+names->groupsOffset);
- groupLimit=(Group *)((char *)names+names->groupsOffset+2)+groupCount;
+ const uint16_t *groups=GET_GROUPS(names);
+ groupCount=*groups++;
+ groupLimit=groups+groupCount*GROUP_LENGTH;
- if(startGroupMSB==group->groupMSB) {
+ if(startGroupMSB==group[GROUP_MSB]) {
/* enumerate characters in the partial start group */
if((start&GROUP_MASK)!=0) {
if(!enumGroupNames(names, group,
fn, context, nameChoice)) {
return FALSE;
}
- ++group; /* continue with the next group */
+ group=NEXT_GROUP(group); /* continue with the next group */
}
- } else if(startGroupMSB>group->groupMSB) {
+ } else if(startGroupMSB>group[GROUP_MSB]) {
/* make sure that we start enumerating with the first group after start */
- if (group + 1 < groupLimit && (group + 1)->groupMSB > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
- UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT;
+ const uint16_t *nextGroup=NEXT_GROUP(group);
+ if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
+ UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
if (end > limit) {
end = limit;
}
return FALSE;
}
}
- ++group;
+ group=nextGroup;
}
/* enumerate entire groups between the start- and end-groups */
- while(group<groupLimit && group->groupMSB<endGroupMSB) {
- start=(UChar32)group->groupMSB<<GROUP_SHIFT;
+ while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
+ const uint16_t *nextGroup;
+ start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
return FALSE;
}
- if (group + 1 < groupLimit && (group + 1)->groupMSB > group->groupMSB + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
- UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT;
+ nextGroup=NEXT_GROUP(group);
+ if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
+ UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
if (end > limit) {
end = limit;
}
- if (!enumExtNames((group->groupMSB + 1) << GROUP_SHIFT, end - 1, fn, context)) {
+ if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
return FALSE;
}
}
- ++group;
+ group=nextGroup;
}
- /* enumerate within the end group (group->groupMSB==endGroupMSB) */
- if(group<groupLimit && group->groupMSB==endGroupMSB) {
+ /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
+ if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
} else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
- UChar32 next = ((group - 1)->groupMSB + 1) << GROUP_SHIFT;
+ UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
if (next > start) {
start = next;
}
char *buffer, uint16_t bufferLength) {
uint16_t bufferPos=0;
- /*
- * Do not write algorithmic Unicode 1.0 names because
- * Unihan names are the same as the modern ones,
- * extension A was only introduced with Unicode 3.0, and
- * the Hangul syllable block was moved and changed around Unicode 1.1.5.
- */
- if(nameChoice==U_UNICODE_10_CHAR_NAME) {
+ /* Only the normative character name can be algorithmic. */
+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
/* zero-terminate */
if(bufferLength>0) {
*buffer=0;
char buffer[200];
uint16_t length;
- if(nameChoice==U_UNICODE_10_CHAR_NAME) {
+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
return TRUE;
}
uint16_t count=range->variant;
const char *s=(const char *)(factors+count);
char *suffix, *t;
- uint16_t prefixLength, i, index;
+ uint16_t prefixLength, i, idx;
char c;
/* increment the indexes in lexical order bound by the factors */
i=count;
for (;;) {
- index=(uint16_t)(indexes[--i]+1);
- if(index<factors[i]) {
+ idx=(uint16_t)(indexes[--i]+1);
+ if(idx<factors[i]) {
/* skip one index and its element string */
- indexes[i]=index;
+ indexes[i]=idx;
s=elements[i];
while(*s++!=0) {
}
findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
UChar32 code;
- if(nameChoice==U_UNICODE_10_CHAR_NAME) {
+ if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
return 0xffff;
}
uint16_t count=range->variant;
const char *s=(const char *)(factors+count), *t;
UChar32 start, limit;
- uint16_t i, index;
+ uint16_t i, idx;
char c;
/* increment the indexes in lexical order bound by the factors */
i=count;
for (;;) {
- index=(uint16_t)(indexes[--i]+1);
- if(index<factors[i]) {
+ idx=(uint16_t)(indexes[--i]+1);
+ if(idx<factors[i]) {
/* skip one index and its element string */
- indexes[i]=index;
+ indexes[i]=idx;
s=elements[i];
while(*s++!=0) {}
elements[i]=s;
int8_t *tokenLengths;
- uint16_t *groups;
- Group *group;
+ const uint16_t *group;
const uint8_t *s, *line, *lineLimit;
int32_t groupCount, lineNumber, length;
uprv_memset(tokenLengths, 0, tokenCount);
}
- groups=(uint16_t *)((char *)uCharNames+uCharNames->groupsOffset);
- groupCount=*groups++;
- group=(Group *)groups;
+ group=GET_GROUPS(uCharNames);
+ groupCount=*group++;
/* enumerate all groups */
while(groupCount>0) {
- s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+
- ((int32_t)group->offsetHigh<<16|group->offsetLow);
+ s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
s=expandGroupLengths(s, offsets, lengths);
/* enumerate all lines in each group */
/*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
}
- ++group;
+ group=NEXT_GROUP(group);
--groupCount;
}