mb to wc


Subject: mb to wc
From: hj (huangj@citiz.net)
Date: Fri Jan 28 2000 - 21:39:57 CST


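Two bytes represent one Chinese character, so the text importer must decode
the input as multibyte characters instead of treating every byte as a
character. IE_Imp_Text::_parseFile should therefore be: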

#include<wchar.h>
#include<string.h>
#include<limits.h>

// Incremental multibyte-to-wide-character decoder.
//
// Bytes are pushed one at a time through mbtowc(); the partial-character
// state is kept in an mbstate_t between calls, so a character of any length
// the current encoding allows (not only two bytes) is assembled correctly.
// Decoding is done by the C library's mbrtowc(), which interprets the bytes
// according to the LC_CTYPE category of the current locale.
class HJ_Mbtowc
{
  // Conversion state carried from one mbtowc() call to the next.
  mbstate_t m_state;
public:
  // Drops any partially decoded character and returns to the initial
  // conversion state (an all-zero mbstate_t).
  void initialize()
        {
          memset(&m_state, 0, sizeof(m_state));
        }
  HJ_Mbtowc()
        {
          initialize();
        }
  int mbtowc(wchar_t &wc,char mb);
};

// Feeds one more byte, mb, to the decoder.
//
// Returns 1 when mb completes a character; the character (which may be the
// null wide character) is stored in wc.
// Returns 0 when no character is available yet, either because more bytes
// are needed or because the bytes seen do not form a valid character; wc is
// left untouched in both cases.
int HJ_Mbtowc::mbtowc(wchar_t &wc,char mb)
{
  // Decode into a local so that wc is only written on success.
  wchar_t decoded = L'\0';

  // Only the new byte is passed: the bytes of an incomplete character are
  // already recorded in m_state, so handing them to mbrtowc() again would
  // decode them twice.
  const size_t rc = mbrtowc(&decoded, &mb, 1, &m_state);

  if (rc == (size_t)-2)
        {
          // Valid so far but incomplete; wait for the next byte.
          return 0;
        }
  if (rc == (size_t)-1)
        {
          // Invalid sequence: the state is unspecified after an encoding
          // error, so reset it to let decoding resynchronize on the bytes
          // that follow.
          initialize();
          return 0;
        }

  // rc is 0 (a null character was decoded) or 1 (this byte completed a
  // character); either way one character is ready.
  wc = decoded;
  return 1;
}

EStatus IE_Imp_Text::_parseFile(FILE * fp)
{
  UT_GrowBuf gbBlock(1024);
  UT_Bool bEatLF = UT_FALSE;
  unsigned char c1;

  HJ_Mbtowc m;
  wchar_t wc;
  UT_UCSChar c;

  while (fread(&c1, 1, sizeof(c1), fp) > 0)
        {
          if(m.mbtowc(wc,c1)){
                c=(UT_UCSChar)wc;
                switch (c)
                  {
                  case (UT_UCSChar)'\r':
                  case (UT_UCSChar)'\n':
                        if ((c == (UT_UCSChar)'\n') && bEatLF)
                          {
                                bEatLF = UT_FALSE;
                                break;
                          }

                        if (c ==(UT_UCSChar) '\r')
                          {
                                bEatLF = UT_TRUE;
                          }

                        // we interprete either CRLF, CR, or LF as a
paragraph break.

                        // start a paragraph and emit any text that we
                        // have accumulated.

X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));
                        if (gbBlock.getLength() > 0)
                          {

X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0),
gbBlock.getLength()));
                                gbBlock.truncate(0);
                          }
                        break;

                  default:
                        bEatLF = UT_FALSE;

                        // deal with plain character.
                        // this cast is OK. we have US-ASCII (actually
Latin-1) character
                        // data, so we can do this.

                        // TODO consider scanning for UTF8...

                        UT_UCSChar uc = c;

X_ReturnNoMemIfError(gbBlock.ins(gbBlock.getLength(),&uc,1));
                        break;
                  }
          }
        }

  if (gbBlock.getLength() > 0)
        {
          // if we have text left over (without final CR/LF),
          // create a paragraph and emit the text now.
          X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Block, NULL));

X_ReturnNoMemIfError(m_pDocument->appendSpan(gbBlock.getPointer(0),
gbBlock.getLength()));
        }

  return IES_OK;
}



This archive was generated by hypermail 2b25 : Fri Jan 28 2000 - 21:45:58 CST