Subject: mb to wc
From: hj (huangj@citiz.net)
Date: Fri Jan 28 2000 - 21:39:57 CST
Two bytes represent one Chinese character, so IE_Imp_Text::_parseFile should
be:
#include<wchar.h>
#include<string.h>
#include<limits.h>
/**
 * Incremental multibyte-to-wide-character converter.
 *
 * Input bytes are fed one at a time through mbtowc().  A partially received
 * multibyte sequence is carried between calls inside the mbstate_t
 * conversion state, so no byte is ever handed to mbrtowc() twice and
 * characters of any encoded length can be decoded.
 *
 * Decoding is done by the C library's mbrtowc(), which interprets bytes
 * according to the LC_CTYPE category of the current locale.
 */
class HJ_Mbtowc
{
    mbstate_t m_state;   // conversion state; holds any partially consumed sequence
    int m_pendingLen;    // bytes consumed since the last completed character
public:
    /** Drop any partial sequence and return to the initial conversion state. */
    void initialize()
    {
        memset(&m_state, '\0', sizeof(m_state));
        m_pendingLen = 0;
    }
    HJ_Mbtowc()
    {
        initialize();
    }
    int mbtowc(wchar_t &wc, char mb);
};

/**
 * Feed one input byte to the converter.
 *
 * @param wc  receives the decoded wide character when the return value is 1;
 *            this function does not assign to it otherwise.
 * @param mb  the next byte of the multibyte input stream.
 * @return 1 if @p mb completed a character (including the null character),
 *         0 if more bytes are needed or the byte was invalid and discarded.
 */
int HJ_Mbtowc::mbtowc(wchar_t &wc, char mb)
{
    // Only the NEW byte is passed: bytes of an unfinished sequence are
    // already recorded in m_state by the earlier calls.
    const size_t result = mbrtowc(&wc, &mb, 1, &m_state);

    if (result == static_cast<size_t>(-2))
    {
        // Incomplete but potentially valid sequence; wait for more input.
        // No character needs more than MB_LEN_MAX bytes, so reaching that
        // count without completing one means the stream is broken: resync.
        if (++m_pendingLen >= MB_LEN_MAX)
        {
            initialize();
        }
        return 0;
    }

    if (result == static_cast<size_t>(-1))
    {
        // Encoding error: the conversion state is unspecified afterwards,
        // so reset it and drop the offending bytes.
        initialize();
        return 0;
    }

    // result is 0 (null character decoded) or 1 (the byte completed a
    // character); in both cases wc has been stored.
    m_pendingLen = 0;
    return 1;
}
/**
 * Import a text stream into the document, one paragraph per line.
 *
 * The stream is read a byte at a time; each byte is passed to an HJ_Mbtowc
 * converter and only the wide characters it reports as complete are
 * processed.  CR, LF and CRLF are each treated as a single paragraph break.
 *
 * @param fp  input stream to read until fread() returns no more data.
 * @return IES_OK when the end of the stream is reached; the
 *         X_ReturnNoMemIfError macro may return earlier (its definition is
 *         outside this excerpt).
 */
EStatus IE_Imp_Text::_parseFile(FILE * fp)
{
    UT_GrowBuf gbBlock(1024);
    UT_Bool bEatLF = UT_FALSE;
    unsigned char c1;
    HJ_Mbtowc m;
    wchar_t wc;
    UT_UCSChar c;

    while (fread(&c1, 1, sizeof(c1), fp) > 0)
    {
        // Nothing to do until the converter reports a complete character.
        if (!m.mbtowc(wc, c1))
        {
            continue;
        }

        c = (UT_UCSChar)wc;
        switch (c)
        {
        case (UT_UCSChar)'\r':
        case (UT_UCSChar)'\n':
            if ((c == (UT_UCSChar)'\n') && bEatLF)
            {
                // This LF is the second half of a CRLF whose CR already
                // produced the paragraph break.
                bEatLF = UT_FALSE;
                break;
            }
            if (c == (UT_UCSChar)'\r')
            {
                bEatLF = UT_TRUE;
            }

            // we interpret either CRLF, CR, or LF as a paragraph break.
            // start a paragraph and emit any text that we
            // have accumulated.
            X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));
            if (gbBlock.getLength() > 0)
            {
                X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0),
                                                             gbBlock.getLength()));
                gbBlock.truncate(0);
            }
            break;

        default:
            {
                bEatLF = UT_FALSE;

                // deal with plain character.
                // c holds the wide character produced by the converter,
                // narrowed to UT_UCSChar by the cast above.
                // NOTE(review): assumes every decoded wchar_t value fits in
                // UT_UCSChar -- confirm against the UT_UCSChar typedef.
                // TODO consider scanning for UTF8...
                UT_UCSChar uc = c;
                X_ReturnNoMemIfError(gbBlock.ins(gbBlock.getLength(), &uc, 1));
            }
            break;
        }
    }

    if (gbBlock.getLength() > 0)
    {
        // if we have text left over (without final CR/LF),
        // create a paragraph and emit the text now.
        X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));
        X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0),
                                                     gbBlock.getLength()));
    }

    return IES_OK;
}
This archive was generated by hypermail 2b25 : Fri Jan 28 2000 - 21:45:58 CST