git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2000-2004, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	* file name: ucnvhz.c
	7	* encoding: US-ASCII
	8	* tab size: 8 (not used)
	9	* indentation:4
	10	*
	11	* created on: 2000oct16
	12	* created by: Ram Viswanadha
	13	* 10/31/2000 Ram Implemented offsets logic function
	14	*
	15	*/
	16
	17	#include "unicode/utypes.h"
	18
	19	#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
	20
	21	#include "cmemory.h"
	22	#include "unicode/ucnv.h"
	23	#include "unicode/ucnv_cb.h"
	24	#include "unicode/uset.h"
	25	#include "ucnv_bld.h"
	26	#include "ucnv_cnv.h"
	27
	/* Single bytes that take part in the HZ escape sequences. */
	#define UCNV_TILDE       0x7E /* ~ */
	#define UCNV_OPEN_BRACE  0x7B /* { */
	#define UCNV_CLOSE_BRACE 0x7D /* } */

	/* Two-byte escape sequences emitted by the from-Unicode path. */
	#define SB_ESCAPE    "\x7E\x7D" /* "~}" : back to single-byte mode */
	#define DB_ESCAPE    "\x7E\x7B" /* "~{" : enter double-byte mode */
	#define TILDE_ESCAPE "\x7E\x7E" /* "~~" : a literal tilde */
	#define ESC_LEN      2          /* byte length of each sequence above */
	35
	36
	/*
	 * Appends `len` bytes starting at `strToAppend` to args->target.
	 *
	 * Bytes that fit are stored at args->target[targetIndex] (advancing
	 * targetIndex); bytes that do not fit are appended to the converter's
	 * charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
	 *
	 * For every byte stored in the target, sourceIndex-1 is recorded as its
	 * offset when args->offsets is non-NULL.  NOTE: the offset is written
	 * through a local variable named `offsets` that must be in scope at the
	 * point of use; it is not one of the macro parameters.
	 *
	 * `targetIndex`, `len` and `strToAppend` are modified and must be lvalues
	 * (`len` is left at -1, `strToAppend` points past the copied bytes).
	 *
	 * The body is wrapped in do { } while(0) so that an invocation followed by
	 * a semicolon behaves as a single statement, e.g. in an unbraced if/else.
	 */
	#define CONCAT_ESCAPE_MACRO(args, targetIndex, targetLength, strToAppend, err, len, sourceIndex) \
	    do { \
	        while((len)-- > 0){ \
	            if((targetIndex) < (targetLength)){ \
	                (args)->target[(targetIndex)] = (unsigned char) *(strToAppend); \
	                if((args)->offsets != NULL){ \
	                    *(offsets++) = (sourceIndex) - 1; \
	                } \
	                (targetIndex)++; \
	            } \
	            else{ \
	                (args)->converter->charErrorBuffer[(int)(args)->converter->charErrorBufferLength++] = (unsigned char) *(strToAppend); \
	                *(err) = U_BUFFER_OVERFLOW_ERROR; \
	            } \
	            (strToAppend)++; \
	        } \
	    } while(0)
	53
	54
	55	typedef struct{
	56	int32_t targetIndex;
	57	int32_t sourceIndex;
	58	UBool isEscapeAppended;
	59	UConverter* gbConverter;
	60	UBool isStateDBCS;
	61	UBool isTargetUCharDBCS;
	62	}UConverterDataHZ;
	63
	64
	65
	66	static void
	67	_HZOpen(UConverter cnv, const char name,const char locale,uint32_t options, UErrorCode errorCode){
	68	cnv->toUnicodeStatus = 0;
	69	cnv->fromUnicodeStatus= 0;
	70	cnv->mode=0;
	71	cnv->fromUChar32=0x0000;
	72	cnv->extraInfo = uprv_malloc (sizeof (UConverterDataHZ));
	73	if(cnv->extraInfo != NULL){
	74	((UConverterDataHZ*)cnv->extraInfo)->gbConverter = ucnv_open("ibm-1386",errorCode);
	75	((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
	76	((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
	77	((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
	78	((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
	79	((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
	80	}
	81	/* test for NULL */
	82	else {
	83	*errorCode = U_MEMORY_ALLOCATION_ERROR;
	84	return;
	85	}
	86	}
	87
	88	static void
	89	_HZClose(UConverter *cnv){
	90	if(cnv->extraInfo != NULL) {
	91	ucnv_close (((UConverterDataHZ *) (cnv->extraInfo))->gbConverter);
	92	if(!cnv->isExtraLocal) {
	93	uprv_free(cnv->extraInfo);
	94	}
	95	cnv->extraInfo = NULL;
	96	}
	97	}
	98
	99	static void
	100	_HZReset(UConverter *cnv, UConverterResetChoice choice){
	101	if(choice<=UCNV_RESET_TO_UNICODE) {
	102	cnv->toUnicodeStatus = 0;
	103	cnv->mode=0;
	104	if(cnv->extraInfo != NULL){
	105	((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
	106	}
	107	}
	108	if(choice!=UCNV_RESET_TO_UNICODE) {
	109	cnv->fromUnicodeStatus= 0;
	110	cnv->fromUChar32=0x0000;
	111	if(cnv->extraInfo != NULL){
	112	((UConverterDataHZ*)cnv->extraInfo)->isEscapeAppended = FALSE;
	113	((UConverterDataHZ*)cnv->extraInfo)->targetIndex = 0;
	114	((UConverterDataHZ*)cnv->extraInfo)->sourceIndex = 0;
	115	((UConverterDataHZ*)cnv->extraInfo)->isTargetUCharDBCS = FALSE;
	116	}
	117	}
	118	}
	119
	120	/************************************HZ Encoding***********************************************
	121	* Rules for HZ encoding
	122	*
	123	* In ASCII mode, a byte is interpreted as an ASCII character, unless a
	124	* '~' is encountered. The character '~' is an escape character. By
	125	* convention, it must be immediately followed ONLY by '~', '{' or '\n'
	126	* (<LF>), with the following special meaning.
	127
	128	* 1. The escape sequence '~~' is interpreted as a '~'.
	129	* 2. The escape-to-GB sequence '~{' switches the mode from ASCII to GB.
	130	* 3. The escape sequence '~\n' is a line-continuation marker to be
	131	* consumed with no output produced.
	132	* In GB mode, characters are interpreted two bytes at a time as (pure)
	133	* GB codes until the escape-from-GB code '~}' is read. This code
	134	* switches the mode from GB back to ASCII. (Note that the escape-
	135	* from-GB code '~}' ($7E7D) is outside the defined GB range.)
	136	*
	137	* Source: RFC 1842
	138	*/
	139
	140
	141	static void
	142	UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
	143	UErrorCode* err){
	144	char tempBuf[2];
	145	const char mySource = ( char ) args->source;
	146	UChar *myTarget = args->target;
	147	const char *mySourceLimit = args->sourceLimit;
	148	UChar32 targetUniChar = 0x0000;
	149	UChar mySourceChar = 0x0000;
	150	UConverterDataHZ* myData=(UConverterDataHZ*)(args->converter->extraInfo);
	151
	152	if ((args->converter == NULL) \|\| (args->targetLimit < args->target) \|\| (mySourceLimit < args->source)){
	153	*err = U_ILLEGAL_ARGUMENT_ERROR;
	154	return;
	155	}
	156
	157	while(mySource< mySourceLimit){
	158
	159	if(myTarget < args->targetLimit){
	160
	161	mySourceChar= (unsigned char) *mySource++;
	162
	163	switch(mySourceChar){
	164	case 0x0A:
	165	if(args->converter->mode ==UCNV_TILDE){
	166	args->converter->mode=0;
	167
	168	}
	169	*(myTarget++)=(UChar)mySourceChar;
	170	continue;
	171
	172	case UCNV_TILDE:
	173	if(args->converter->mode ==UCNV_TILDE){
	174	*(myTarget++)=(UChar)mySourceChar;
	175	args->converter->mode=0;
	176	continue;
	177
	178	}
	179	else if(args->converter->toUnicodeStatus !=0){
	180	args->converter->mode=0;
	181	break;
	182	}
	183	else{
	184	args->converter->mode = UCNV_TILDE;
	185	continue;
	186	}
	187
	188
	189	case UCNV_OPEN_BRACE:
	190	if(args->converter->mode == UCNV_TILDE){
	191	args->converter->mode=0;
	192	myData->isStateDBCS = TRUE;
	193	continue;
	194	}
	195	else{
	196	break;
	197	}
	198
	199
	200	case UCNV_CLOSE_BRACE:
	201	if(args->converter->mode == UCNV_TILDE){
	202	args->converter->mode=0;
	203	myData->isStateDBCS = FALSE;
	204	continue;
	205	}
	206	else{
	207	break;
	208	}
	209
	210	default:
	211	/* if the first byte is equal to TILDE and the trail byte
	212	* is not a valid byte then it is an error condition
	213	*/
	214	if(args->converter->mode == UCNV_TILDE){
	215	args->converter->mode=0;
	216	mySourceChar= (UChar)(((UCNV_TILDE+0x80) << 8) \| ((mySourceChar & 0x00ff)+0x80));
	217	goto SAVE_STATE;
	218	}
	219
	220	break;
	221
	222	}
	223
	224	if(myData->isStateDBCS){
	225	if(args->converter->toUnicodeStatus == 0x00){
	226	args->converter->toUnicodeStatus = (UChar) mySourceChar;
	227	continue;
	228	}
	229	else{
	230	tempBuf[0] = (char) (args->converter->toUnicodeStatus+0x80) ;
	231	tempBuf[1] = (char) (mySourceChar+0x80);
	232	mySourceChar= (UChar)(((args->converter->toUnicodeStatus+0x80) << 8) \| ((mySourceChar & 0x00ff)+0x80));
	233	args->converter->toUnicodeStatus =0x00;
	234	targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
	235	tempBuf, 2, args->converter->useFallback);
	236	}
	237	}
	238	else{
	239	if(args->converter->fromUnicodeStatus == 0x00){
	240	targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->gbConverter->sharedData,
	241	mySource - 1, 1, args->converter->useFallback);
	242	}
	243	else{
	244	goto SAVE_STATE;
	245	}
	246
	247	}
	248	if(targetUniChar < 0xfffe){
	249	if(args->offsets) {
	250	args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 1-(myData->isStateDBCS));
	251	}
	252
	253	*(myTarget++)=(UChar)targetUniChar;
	254	}
	255	else if(targetUniChar>=0xfffe){
	256	SAVE_STATE:
	257	if(targetUniChar == 0xfffe){
	258	*err = U_INVALID_CHAR_FOUND;
	259	}
	260	else{
	261	*err = U_ILLEGAL_CHAR_FOUND;
	262	}
	263	if(myData->isStateDBCS){
	264	args->converter->toUBytes[0] = (uint8_t)(tempBuf[0]-0x80);
	265	args->converter->toUBytes[1] = (uint8_t)(tempBuf[1]-0x80);
	266	args->converter->toULength=2;
	267	}
	268	else{
	269	args->converter->toUBytes[0] = (uint8_t)mySourceChar;
	270	args->converter->toULength=1;
	271	}
	272	break;
	273	}
	274	}
	275	else{
	276	*err =U_BUFFER_OVERFLOW_ERROR;
	277	break;
	278	}
	279	}
	280
	281	args->target = myTarget;
	282	args->source = mySource;
	283	}
	284
	285
	286	static void
	287	UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
	288	UErrorCode * err){
	289	const UChar *mySource = args->source;
	290	char *myTarget = args->target;
	291	int32_t* offsets = args->offsets;
	292	int32_t mySourceIndex = 0;
	293	int32_t myTargetIndex = 0;
	294	int32_t targetLength = (int32_t)(args->targetLimit - myTarget);
	295	int32_t mySourceLength = (int32_t)(args->sourceLimit - args->source);
	296	int32_t length=0;
	297	uint32_t targetUniChar = 0x0000;
	298	UChar32 mySourceChar = 0x0000,c=0x0000;
	299	UConverterDataHZ myConverterData=(UConverterDataHZ)args->converter->extraInfo;
	300	UBool isTargetUCharDBCS = (UBool) myConverterData->isTargetUCharDBCS;
	301	UBool oldIsTargetUCharDBCS = isTargetUCharDBCS;
	302	UBool isEscapeAppended =FALSE;
	303	int len =0;
	304	const char* escSeq=NULL;
	305
	306	if ((args->converter == NULL) \|\| (args->targetLimit < myTarget) \|\| (args->sourceLimit < args->source)){
	307	*err = U_ILLEGAL_ARGUMENT_ERROR;
	308	return;
	309	}
	310	if(args->converter->fromUChar32!=0 && myTargetIndex < targetLength) {
	311	goto getTrail;
	312	}
	313	/writing the char to the output stream /
	314	while (mySourceIndex < mySourceLength){
	315	targetUniChar = missingCharMarker;
	316	if (myTargetIndex < targetLength){
	317
	318	c=mySourceChar = (UChar) mySource[mySourceIndex++];
	319
	320
	321	oldIsTargetUCharDBCS = isTargetUCharDBCS;
	322	if(mySourceChar ==UCNV_TILDE){
	323	/concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);/
	324	len = ESC_LEN;
	325	escSeq = TILDE_ESCAPE;
	326	CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
	327	continue;
	328	}
	329	else{
	330	length= ucnv_MBCSFromUChar32(myConverterData->gbConverter->sharedData,
	331	mySourceChar,&targetUniChar,args->converter->useFallback);
	332
	333	}
	334	/* only DBCS or SBCS characters are expected*/
	335	/* DB haracters with high bit set to 1 are expected */
	336	if(length > 2 \|\| length==0 \|\|(((targetUniChar & 0x8080) != 0x8080)&& length==2)){
	337	targetUniChar= missingCharMarker;
	338	}
	339	if (targetUniChar != missingCharMarker){
	340	myConverterData->isTargetUCharDBCS = isTargetUCharDBCS = (UBool)(targetUniChar>0x00FF);
	341	if(oldIsTargetUCharDBCS != isTargetUCharDBCS \|\| !myConverterData->isEscapeAppended ){
	342	/Shifting from a double byte to single byte mode/
	343	if(!isTargetUCharDBCS){
	344	len =ESC_LEN;
	345	escSeq = SB_ESCAPE;
	346	CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
	347	myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
	348	}
	349	else{ /* Shifting from a single byte to double byte mode*/
	350	len =ESC_LEN;
	351	escSeq = DB_ESCAPE;
	352	CONCAT_ESCAPE_MACRO(args, myTargetIndex, targetLength, escSeq,err,len,mySourceIndex);
	353	myConverterData->isEscapeAppended =isEscapeAppended =TRUE;
	354
	355	}
	356	}
	357
	358	if(isTargetUCharDBCS){
	359	if( myTargetIndex <targetLength){
	360	myTarget[myTargetIndex++] =(char) ((targetUniChar >> 8) -0x80);
	361	if(offsets){
	362	*(offsets++) = mySourceIndex-1;
	363	}
	364	if(myTargetIndex < targetLength){
	365	myTarget[myTargetIndex++] =(char) ((targetUniChar & 0x00FF) -0x80);
	366	if(offsets){
	367	*(offsets++) = mySourceIndex-1;
	368	}
	369	}else{
	370	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
	371	*err = U_BUFFER_OVERFLOW_ERROR;
	372	}
	373	}else{
	374	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =(char) ((targetUniChar >> 8) -0x80);
	375	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) ((targetUniChar & 0x00FF) -0x80);
	376	*err = U_BUFFER_OVERFLOW_ERROR;
	377	}
	378
	379	}else{
	380	if( myTargetIndex <targetLength){
	381	myTarget[myTargetIndex++] = (char) (targetUniChar );
	382	if(offsets){
	383	*(offsets++) = mySourceIndex-1;
	384	}
	385
	386	}else{
	387	args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (char) targetUniChar;
	388	*err = U_BUFFER_OVERFLOW_ERROR;
	389	}
	390	}
	391
	392	}
	393	else{
	394	/* oops.. the code point is unassigned */
	395	/Handle surrogates /
	396	/check if the char is a First surrogate/
	397	if(UTF_IS_SURROGATE(mySourceChar)) {
	398	if(UTF_IS_SURROGATE_FIRST(mySourceChar)) {
	399	args->converter->fromUChar32=mySourceChar;
	400	getTrail:
	401	/look ahead to find the trail surrogate/
	402	if(mySourceIndex < mySourceLength) {
	403	/* test the following code unit */
	404	UChar trail=(UChar) args->source[mySourceIndex];
	405	if(UTF_IS_SECOND_SURROGATE(trail)) {
	406	++mySourceIndex;
	407	mySourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUChar32, trail);
	408	args->converter->fromUChar32=0x00;
	409	/* there are no surrogates in GB2312*/
	410	*err = U_INVALID_CHAR_FOUND;
	411	/* exit this condition tree */
	412	} else {
	413	/* this is an unmatched lead code unit (1st surrogate) */
	414	/* callback(illegal) */
	415	*err=U_ILLEGAL_CHAR_FOUND;
	416	}
	417	} else {
	418	/* no more input */
	419	*err = U_ZERO_ERROR;
	420	}
	421	} else {
	422	/* this is an unmatched trail code unit (2nd surrogate) */
	423	/* callback(illegal) */
	424	*err=U_ILLEGAL_CHAR_FOUND;
	425	}
	426	} else {
	427	/* callback(unassigned) for a BMP code point */
	428	*err = U_INVALID_CHAR_FOUND;
	429	}
	430
	431	args->converter->fromUChar32=mySourceChar;
	432	break;
	433	}
	434	}
	435	else{
	436	*err = U_BUFFER_OVERFLOW_ERROR;
	437	break;
	438	}
	439	targetUniChar=missingCharMarker;
	440	}
	441
	442	args->target += myTargetIndex;
	443	args->source += mySourceIndex;
	444	myConverterData->isTargetUCharDBCS = isTargetUCharDBCS;
	445	}
	446
	447	static void
	448	_HZ_WriteSub(UConverterFromUnicodeArgs args, int32_t offsetIndex, UErrorCode err) {
	449	UConverter *cnv = args->converter;
	450	UConverterDataHZ convData=(UConverterDataHZ ) cnv->extraInfo;
	451	char *p;
	452	char buffer[4];
	453	p = buffer;
	454
	455	if( convData->isTargetUCharDBCS){
	456	*p++= UCNV_TILDE;
	457	*p++= UCNV_CLOSE_BRACE;
	458	convData->isTargetUCharDBCS=FALSE;
	459	}
	460	*p++= cnv->subChar[0];
	461
	462	ucnv_cbFromUWriteBytes(args,
	463	buffer, (int32_t)(p - buffer),
	464	offsetIndex, err);
	465	}
	466
	467	/* structure for SafeClone calculations */
	468	struct cloneHZStruct
	469	{
	470	UConverter cnv;
	471	UAlignedMemory deadSpace1;
	472	UConverter subCnv;
	473	UAlignedMemory deadSpace2;
	474	UConverterDataHZ mydata;
	475	};
	476
	477
	478	static UConverter *
	479	_HZ_SafeClone(const UConverter *cnv,
	480	void *stackBuffer,
	481	int32_t *pBufferSize,
	482	UErrorCode *status)
	483	{
	484	struct cloneHZStruct * localClone;
	485	int32_t size, bufferSizeNeeded = sizeof(struct cloneHZStruct);
	486
	487	if (U_FAILURE(*status)){
	488	return 0;
	489	}
	490
	491	if (pBufferSize == 0){ / 'preflighting' request - set needed size into pBufferSize /
	492	*pBufferSize = bufferSizeNeeded;
	493	return 0;
	494	}
	495
	496	localClone = (struct cloneHZStruct *)stackBuffer;
	497	uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
	498
	499	uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataHZ));
	500	localClone->cnv.extraInfo = &localClone->mydata;
	501	localClone->cnv.isExtraLocal = TRUE;
	502
	503	/* deep-clone the sub-converter */
	504	size = (int32_t)sizeof(UConverter);
	505	((UConverterDataHZ*)localClone->cnv.extraInfo)->gbConverter =
	506	ucnv_safeClone(((UConverterDataHZ*)cnv->extraInfo)->gbConverter, &localClone->subCnv, &size, status);
	507
	508	return &localClone->cnv;
	509	}
	510
	511	static void
	512	_HZ_GetUnicodeSet(const UConverter *cnv,
	513	USetAdder *sa,
	514	UConverterUnicodeSet which,
	515	UErrorCode *pErrorCode) {
	516	/* the tilde '~' is hardcoded in the converter */
	517	sa->add(sa->set, 0x7e);
	518
	519	/* add all of the code points that the sub-converter handles */
	520	((UConverterDataHZ*)cnv->extraInfo)->
	521	gbConverter->sharedData->impl->
	522	getUnicodeSet(((UConverterDataHZ*)cnv->extraInfo)->gbConverter,
	523	sa, which, pErrorCode);
	524	}
	525
	526	static const UConverterImpl _HZImpl={
	527
	528	UCNV_HZ,
	529
	530	NULL,
	531	NULL,
	532
	533	_HZOpen,
	534	_HZClose,
	535	_HZReset,
	536
	537	UConverter_toUnicode_HZ_OFFSETS_LOGIC,
	538	UConverter_toUnicode_HZ_OFFSETS_LOGIC,
	539	UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
	540	UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
	541	NULL,
	542
	543	NULL,
	544	NULL,
	545	_HZ_WriteSub,
	546	_HZ_SafeClone,
	547	_HZ_GetUnicodeSet
	548	};
	549
	550	static const UConverterStaticData _HZStaticData={
	551	sizeof(UConverterStaticData),
	552	"HZ",
	553	0,
	554	UCNV_IBM,
	555	UCNV_HZ,
	556	1,
	557	4,
	558	{ 0x1a, 0, 0, 0 },
	559	1,
	560	FALSE,
	561	FALSE,
	562	0,
	563	0,
	564	{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
	565
	566	};
	567
	568
	569	const UConverterSharedData _HZData={
	570	sizeof(UConverterSharedData),
	571	~((uint32_t) 0),
	572	NULL,
	573	NULL,
	574	&_HZStaticData,
	575	FALSE,
	576	&_HZImpl,
	577	0
	578	};
	579
	580	#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */