Patch: Multi-encoding Text import/export

Subject: Patch: Multi-encoding Text import/export
From: Andrew Dunbar (hippietrail@yahoo.com)
Date: Sat May 19 2001 - 03:19:21 CDT

sorted by: [ date ] [ thread ] [ subject ] [ author ]
Next message: Martin Sevior: "RE: Styles again."
Previous message: ha shao: "Re: cjk support on unix is broken"
Next in thread: Sam TH: "Re: Patch: Multi-encoding Text import/export"

I consider this a pretty important change.

It allows you to import a text file regardless of
whether it uses an old 8-bit encoding, UTF-8, or
UCS-2 (as used by Windows and Mac OS X).

It also allows you to export to any of these text
formats - though changes are needed to the rest of
AbiWord to fully support this.

This also means we will no longer need separate
UTF-8 and UCS-2 importers and exporters, and any
.txt file will "just work" - perfect for church
secretaries (:

Please somebody have a serious look at this!
Feedback much appreciated.

Andrew Dunbar.

-- http://linguaphile.sourceforge.net

Index: src/af/util/xp/ut_mbtowc.cpp =================================================================== RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.cpp,v retrieving revision 1.17 diff -u -r1.17 ut_mbtowc.cpp --- src/af/util/xp/ut_mbtowc.cpp 2001/05/03 21:11:37 1.17 +++ src/af/util/xp/ut_mbtowc.cpp 2001/05/19 07:27:00 @@ -22,6 +22,8 @@ #include <limits.h> #include "ut_mbtowc.h" +// UTF-8 can use up to 6 bytes +#define MY_MB_LEN_MAX 6 #if 0 /* big if 0 */ #if defined(__OpenBSD__) || defined(__FreeBSD__) @@ -191,7 +193,7 @@ int UT_Mbtowc::mbtowc(wchar_t &wc,char mb) { - if(++m_bufLen>MB_LEN_MAX) + if(++m_bufLen>MY_MB_LEN_MAX) { initialize(); return 0; @@ -202,7 +204,7 @@ #else size_t thisLen=mbrtowc(&wc,m_buf,m_bufLen,&m_state); #endif - if(thisLen>MB_LEN_MAX)return 0; + if(thisLen>MY_MB_LEN_MAX)return 0; if(thisLen==0)thisLen=1; m_bufLen-=thisLen; return 1; @@ -229,6 +231,12 @@ cd = iconv_open("UCS-2", charset ); }; +UT_Mbtowc::UT_Mbtowc(const char* from_charset): m_bufLen(0) +{ + cd = iconv_open("UCS-2", from_charset); + UT_ASSERT(cd != (iconv_t)-1); +}; + UT_Mbtowc::UT_Mbtowc(): m_bufLen(0) { cd = iconv_open("UCS-2", XAP_EncodingManager::get_instance()->getNativeEncodingName() ); @@ -250,7 +258,7 @@ int UT_Mbtowc::mbtowc(wchar_t &wc,char mb) { - if(++m_bufLen>MB_LEN_MAX) { + if(++m_bufLen>MY_MB_LEN_MAX) { initialize(); return 0; } Index: src/af/util/xp/ut_mbtowc.h =================================================================== RCS file: /cvsroot/abi/src/af/util/xp/ut_mbtowc.h,v retrieving revision 1.9 diff -u -r1.9 ut_mbtowc.h --- src/af/util/xp/ut_mbtowc.h 2000/11/04 04:54:56 1.9 +++ src/af/util/xp/ut_mbtowc.h 2001/05/19 07:27:00 @@ -49,6 +49,7 @@ public: void initialize(); UT_Mbtowc(); + UT_Mbtowc(const char* from_charset); UT_Mbtowc(const UT_Mbtowc& v); ~UT_Mbtowc(); int mbtowc(wchar_t &wc,char mb); Index: src/wp/impexp/xp/ie_exp.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp.cpp,v 
retrieving revision 1.46 diff -u -r1.46 ie_exp.cpp --- src/wp/impexp/xp/ie_exp.cpp 2001/05/05 20:08:13 1.46 +++ src/wp/impexp/xp/ie_exp.cpp 2001/05/19 07:27:31 @@ -109,7 +109,8 @@ // TODO add code to make a backup of the original file, if it exists. #ifndef HAVE_GNOMEVFS - m_fp = fopen(szFilename,"w"); + // Open file in binary mode or UCS-2 output will be mangled. + m_fp = fopen(szFilename,"wb"); return (m_fp != 0); #else GnomeVFSResult result; Index: src/wp/impexp/xp/ie_exp_Text.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.cpp,v retrieving revision 1.23 diff -u -r1.23 ie_exp_Text.cpp --- src/wp/impexp/xp/ie_exp_Text.cpp 2001/05/05 20:08:13 1.23 +++ src/wp/impexp/xp/ie_exp_Text.cpp 2001/05/19 07:27:39 @@ -34,6 +34,8 @@ #include "ut_string_class.h" +#define MY_MB_LEN_MAX 6 + ////////////////////////////////////////////////////////////////// // a private listener class to help us translate the document // into a text stream. code is at the bottom of this file. @@ -70,12 +72,18 @@ protected: void _closeBlock(void); void _outputData(const UT_UCSChar * p, UT_uint32 length); + void _output8BitData(const UT_UCSChar * , UT_uint32 length); + void _output16BitData(const UT_UCSChar * , UT_uint32 length); PD_Document * m_pDocument; IE_Exp_Text * m_pie; bool m_bInBlock; bool m_bToClipboard; - UT_Wctomb m_wctomb; + bool m_bFirstWrite; + UT_Wctomb m_wctomb; + const char * m_szEncoding; + bool m_bBigEndian; + bool m_bUseBOM; }; /*****************************************************************/ @@ -144,34 +152,56 @@ if (!m_bInBlock) return; -#ifdef WIN32 // we need to generate CRLFs on Win32 - if (m_bToClipboard) // when writing to the clipboard. we - m_pie->write("\r"); // use text mode when going to a file -#endif // so we don't need to then. + // TODO All writes should be re-routed via iconv since UCS-2 + // TODO uses two bytes for each character. + // TODO Old Mac should use "\r". 
Mac OSX should Use U+2028 or U+2029. +#ifdef WIN32 + m_pie->write("\r\n"); +#else m_pie->write("\n"); +#endif m_bInBlock = false; return; } void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length) { + if (m_szEncoding && !strncmp(m_szEncoding,"UCS-2",5)) + _output16BitData(data, length); + else + _output8BitData(data, length); +} + +void s_Text_Listener::_output8BitData(const UT_UCSChar * data, UT_uint32 length) +{ UT_String sBuf; const UT_UCSChar * pData; int mbLen; - char pC[MB_LEN_MAX]; - + char pC[MY_MB_LEN_MAX]; + UT_ASSERT(sizeof(UT_Byte) == sizeof(char)); + if (m_bFirstWrite) + { + if (m_szEncoding) + m_wctomb.setOutCharset(m_szEncoding); + if (m_bUseBOM) + { + // TODO There may be reason for using a BOM in UTF-8 text. + // TODO I've seen MS software do it. + } + m_bFirstWrite = false; + } + for (pData=data; (pData<data+length); /**/) { if(!m_wctomb.wctomb(pC,mbLen,(wchar_t)*pData)) { - mbLen=1; - pC[0]='?'; - m_wctomb.initialize(); + mbLen=1; + pC[0]='?'; + m_wctomb.initialize(); } - pData++; if (mbLen>1) { sBuf += pC; @@ -179,17 +209,89 @@ else { // We let any UCS_LF's (forced line breaks) go out as is. + // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. 
#ifdef WIN32 - if (m_bToClipboard && pC[0]==UCS_LF) + if (pC[0]==UCS_LF) sBuf += "\r"; #endif sBuf += (char)pC[0]; } + pData++; } m_pie->write(sBuf.c_str(),sBuf.size()); } +void s_Text_Listener::_output16BitData(const UT_UCSChar * data, UT_uint32 length) +{ + const UT_UCSChar * pInData; + char * pOutData; + + int mbLen; + unsigned char pC[MY_MB_LEN_MAX]; + char * pConvertedData = 0; + + UT_ASSERT(sizeof(UT_Byte) == sizeof(char)); + + pConvertedData = new char[length * sizeof(UT_UCSChar)]; + pOutData = pConvertedData; + + UT_ASSERT(pConvertedData); + + if (m_bFirstWrite) + { + if (m_szEncoding) + m_wctomb.setOutCharset(m_szEncoding); + if (m_bUseBOM) + { + if (m_bBigEndian) + m_pie->write("\xfe\xff",2); + else + m_pie->write("\xff\xfe",2); + } + m_bFirstWrite = false; + } + + for (pInData=data; (pInData<data+length); /**/) + { + if(!m_wctomb.wctomb(reinterpret_cast<char *>(pC),mbLen,(wchar_t)*pInData)) + { + // TODO U+FFFD "REPLACEMENT CHARACTER" is the + // TODO correct unicode equivalent of '?' isn't it? + mbLen=2; + if (m_bBigEndian) + { + pC[0]=0xff; + pC[1]=0xfd; + } + else + { + pC[0]=0xfd; + pC[1]=0xff; + } + m_wctomb.initialize(); + } + // We let any UCS_LF's (forced line breaks) go out as is. + if (*pInData == UCS_LF) + { + // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. +#ifdef WIN32 + // TODO Win needs to *insert* an extra CR character before the LF. + // TODO The old 8-bit code used UT_String which could grow dynamically + // TODO but the 16-bit code uses a fixed size buffer. + // TODO What is an appropriate solution? +#endif + } + *pOutData++ = pC[0]; + *pOutData++ = pC[1]; + ++pInData; + } + + m_pie->write(pConvertedData,length * sizeof(UT_UCSChar)); + + delete [] pConvertedData; +} + s_Text_Listener::s_Text_Listener(PD_Document * pDocument, IE_Exp_Text * pie, bool bToClipboard) @@ -201,6 +303,10 @@ // assume that we are starting in the middle of a block. // when going to a file we should not. 
m_bInBlock = m_bToClipboard; + m_bFirstWrite = true; + m_szEncoding = 0; + m_bBigEndian = true; + m_bUseBOM = false; } s_Text_Listener::~s_Text_Listener() Index: src/wp/impexp/xp/ie_imp.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp.cpp,v retrieving revision 1.40 diff -u -r1.40 ie_imp.cpp --- src/wp/impexp/xp/ie_imp.cpp 2001/05/07 16:50:43 1.40 +++ src/wp/impexp/xp/ie_imp.cpp 2001/05/19 07:27:40 @@ -176,7 +176,8 @@ char szBuf[4096]; // 4096 ought to be enough int iNumbytes; FILE *f; - if ( ( f = fopen( szFilename, "r" ) ) != (FILE *)0 ) + // we must open in binary mode for UCS-2 compatibility + if ( ( f = fopen( szFilename, "rb" ) ) != (FILE *)0 ) { iNumbytes = fread(szBuf, 1, sizeof(szBuf), f); fclose(f); Index: src/wp/impexp/xp/ie_imp_Text.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.cpp,v retrieving revision 1.24 diff -u -r1.24 ie_imp_Text.cpp --- src/wp/impexp/xp/ie_imp_Text.cpp 2001/05/03 00:45:36 1.24 +++ src/wp/impexp/xp/ie_imp_Text.cpp 2001/05/19 07:28:01 @@ -29,21 +29,177 @@ #include "ut_growbuf.h" #include "xap_EncodingManager.h" +// TODO Can we make these members of the importer or the sniffer? +enum UCS2_Endian { UE_BigEnd = -1, UE_NotUCS = 0, UE_LittleEnd }; + +static bool _recognizeUTF8 (const char * szBuf, + UT_uint32 iNumbytes); +static UCS2_Endian _recognizeUCS2 (const char * szBuf, + UT_uint32 iNumbytes, + bool bDeep); + /*****************************************************************/ /*****************************************************************/ bool IE_Imp_Text_Sniffer::recognizeContents(const char * szBuf, UT_uint32 iNumbytes) { - // We give the other guys a chance, since this - // importer is so generic. - return false; + // TODO It may or may not be worthwhile trying to guess CJK encodings. 
+ + bool bSuccess = false; + + bSuccess = _recognizeUTF8(szBuf, iNumbytes); + + if (bSuccess == false) + { + if (_recognizeUCS2(szBuf, iNumbytes, false) != UE_NotUCS) + { + bSuccess = true; + } + } + + return bSuccess; +} + +static bool _recognizeUTF8(const char * szBuf, + UT_uint32 iNumbytes) +{ + bool bSuccess = false; + const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf); + + while (p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes)) + { + UT_sint32 iLen; + + if ((*p & 0x80) == 0) // ASCII + { + ++p; + continue; + } + else if ((*p & 0xc0) == 0x80) // not UTF-8 + { + return false; + } + else if (*p == 0xfe || *p == 0xff) + { + // BOM shouldn't occur in UTF-8 - file may be UCS-2 + return false; + } + else if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence + iLen = 6; + else if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence + iLen = 5; + else if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence + iLen = 4; + else if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence + iLen = 3; + else if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence + iLen = 2; + else + { + // the above code covers all cases - if we reach here the logic is wrong + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); + return false; + } + + while (--iLen) + { + ++p; + if (p >= reinterpret_cast<const unsigned char *>(szBuf + iNumbytes)) + { + //UT_DEBUGMSG((" out of data!\n")); + break; + } + if ((*p & 0xc0) != 0x80) + return false; + } + // all bytes in sequence were ok + bSuccess = true; + ++p; + } + + return bSuccess; } +static UCS2_Endian _recognizeUCS2(const char * szBuf, + UT_uint32 iNumbytes, + bool bDeep) +{ + UCS2_Endian eResult = UE_NotUCS; + + if (iNumbytes >= 2) + { + const unsigned char *p = reinterpret_cast<const unsigned char *>(szBuf); + + // Big endian ? 
+ if (p[0] == 0xfe && p[1] == 0xff) + eResult = UE_BigEnd; + + // Little endian + else if (p[0] == 0xff && p[1] == 0xfe) + eResult = UE_LittleEnd; + + if (eResult == UE_NotUCS && bDeep) + { + // If we know this is a text file, know it isn't UTF-8, and it doesn't + // begin with a BOM, let's try a couple of heuristics too see if it + // might be a UCS-2 file without a BOM. + // Since CR and LF are very common and their endian-swapped counterparts + // are reserved in Unicode, they should only exist in big endian or + // little endian but not both. + // If there are no CRs or LFs we fall back on counting how many characters + // fall within the ASCII range for both endians. The one with the higher + // count wins. + // Text files which contain NUL characters will be wrongly identified as + // UCS-2 using this technique. + + UT_sint32 iLineEndBE = 0; + UT_sint32 iLineEndLE = 0; + UT_sint32 iAsciiBE = 0; + UT_sint32 iAsciiLE = 0; + + // Count all CR, LF, and ASCII range characters. + for (p = reinterpret_cast<const unsigned char *>(szBuf); + p < reinterpret_cast<const unsigned char *>(szBuf + iNumbytes - 1); + p += 2) + { + // A 16-bit null character probably won't exist in a UCS-2 file + if (p[0] == 0 && p[1] == 0) + break; + if (p[0] == 0) + { + ++iAsciiBE; + if (p[1] == 0x0A || p[1] == 0x0D) + ++iLineEndBE; + } + if (p[1] == 0) + { + ++iAsciiLE; + if (p[0] == 0x0A || p[0] == 0x0D) + ++iLineEndLE; + } + } + + // Take an educated guess. + if (iLineEndBE && !iLineEndLE) + eResult = UE_BigEnd; + else if (iLineEndLE && !iLineEndBE) + eResult = UE_LittleEnd; + else if (!iLineEndBE && !iLineEndLE) + { + if (iAsciiBE > iAsciiLE) + eResult = UE_BigEnd; + else if (iAsciiLE > iAsciiBE) + eResult = UE_LittleEnd; + } + } + } + + return eResult; +} + bool IE_Imp_Text_Sniffer::recognizeSuffix(const char * szSuffix) { - // We give the other guys a chance, since this - // importer is so generic. 
return (!UT_stricmp (szSuffix, ".txt") || !UT_stricmp(szSuffix, ".text")); } @@ -69,10 +225,9 @@ /*****************************************************************/ /* - Import US-ASCII (actually Latin-1) data from a plain - text file. We allow either LF or CR or CRLF line - termination. Each line terminator is taken to be a - paragraph break. + Import data from a plain text file. We allow either + LF or CR or CRLF line termination. Each line + terminator is taken to be a paragraph break. */ /*****************************************************************/ @@ -82,7 +237,8 @@ UT_Error IE_Imp_Text::importFile(const char * szFilename) { - FILE *fp = fopen(szFilename, "r"); + // We must open in binary mode for UCS-2 compatibility. + FILE *fp = fopen(szFilename, "rb"); if (!fp) { UT_DEBUGMSG(("Could not open file %s\n",szFilename)); @@ -91,6 +247,9 @@ UT_Error error; + // First we need to determine the encoding. + // TODO We might want to find a way to combine this with recognizeContents(). 
+ X_CleanupIfError(error,_recognizeEncoding(fp)); X_CleanupIfError(error,_writeHeader(fp)); X_CleanupIfError(error,_parseFile(fp)); @@ -113,6 +272,7 @@ IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument) : IE_Imp(pDocument) { + m_szEncoding = 0; } /*****************************************************************/ @@ -121,6 +281,37 @@ #define X_ReturnIfFail(exp,error) do { bool b = (exp); if (!b) return (error); } while (0) #define X_ReturnNoMemIfError(exp) X_ReturnIfFail(exp,UT_IE_NOMEMORY) +UT_Error IE_Imp_Text::_recognizeEncoding(FILE * fp) +{ + char szBuf[4096]; // 4096 ought to be enough + UT_sint32 iNumbytes; + + iNumbytes = fread(szBuf, 1, sizeof(szBuf), fp); + fseek(fp, 0, SEEK_SET); + + if (_recognizeUTF8(szBuf, iNumbytes)) + { + m_szEncoding = "UTF-8"; + } + else + { + UCS2_Endian eUcs2 = UE_NotUCS; + + eUcs2 = _recognizeUCS2(szBuf, iNumbytes, true); + + if (eUcs2 == UE_BigEnd) + { + m_szEncoding = "UCS-2-BE"; + } + else if (eUcs2 == UE_LittleEnd) + { + m_szEncoding = "UCS-2-LE"; + } + } + + return UT_OK; +} + UT_Error IE_Imp_Text::_writeHeader(FILE * /* fp */) { X_ReturnNoMemIfError(m_pDocument->appendStrux(PTX_Section, NULL)); @@ -137,6 +328,9 @@ UT_UCSChar c; wchar_t wc; + if (m_szEncoding) + m_Mbtowc.setInCharset(m_szEncoding); + while (fread(&b, 1, sizeof(b), fp) > 0) { if(!m_Mbtowc.mbtowc(wc,b)) @@ -146,6 +340,8 @@ { case (UT_UCSChar)'\r': case (UT_UCSChar)'\n': + case 0x2028: // Unicode line separator + case 0x2029: // Unicode paragraph separator if ((c == (UT_UCSChar)'\n') && bEatLF) { @@ -158,7 +354,9 @@ bEatLF = true; } - // we interprete either CRLF, CR, or LF as a paragraph break. + // we interpret either CRLF, CR, or LF as a paragraph break. + // we also accept U+2028 (line separator) and U+2029 (para separator) + // especially since these are recommended by Mac OS X. // start a paragraph and emit any text that we // have accumulated. 
@@ -224,6 +422,8 @@ { case (UT_UCSChar)'\r': case (UT_UCSChar)'\n': + case 0x2028: // Unicode line separator + case 0x2029: // Unicode paragraph separator if ((c == (UT_UCSChar)'\n') && bEatLF) { bEatLF = false; @@ -235,7 +435,9 @@ bEatLF = true; } - // we interprete either CRLF, CR, or LF as a paragraph break. + // we interpret either CRLF, CR, or LF as a paragraph break. + // we also accept U+2028 (line separator) and U+2029 (para separator) + // especially since these are recommended by Mac OS X. if (gbBlock.getLength() > 0) { Index: src/wp/impexp/xp/ie_imp_Text.h =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_imp_Text.h,v retrieving revision 1.13 diff -u -r1.13 ie_imp_Text.h --- src/wp/impexp/xp/ie_imp_Text.h 2001/05/03 00:45:36 1.13 +++ src/wp/impexp/xp/ie_imp_Text.h 2001/05/19 07:28:02 @@ -58,9 +58,11 @@ unsigned char * pData, UT_uint32 lenData); protected: + UT_Error _recognizeEncoding(FILE * fp); UT_Error _parseFile(FILE * fp); UT_Error _writeHeader(FILE * fp); UT_Mbtowc m_Mbtowc; + const char * m_szEncoding; }; #endif /* IE_IMP_TEXT_H */

_________________________________________________________ Do You Yahoo!? Get your free @yahoo.com address at http://mail.yahoo.com

Next message: Martin Sevior: "RE: Styles again."
Previous message: ha shao: "Re: cjk support on unix is broken"
Next in thread: Sam TH: "Re: Patch: Multi-encoding Text import/export"

This archive was generated by hypermail 2b25 : Sat May 26 2001 - 03:51:05 CDT