Subject: Patch: Text export / unicode clipboard
From: Andrew Dunbar (hippietrail@yahoo.com)
Date: Fri Jun 15 2001 - 02:10:37 CDT
Here is a much tidied-up text exporter. I removed a lot of special
cases, made the Unicode clipboard work for non-Unicode locales, and
improved the coding style.
Andrew Dunbar.
-- http://linguaphile.sourceforge.net
Index: src/af/util/win/ut_Win32OS.h =================================================================== RCS file: /cvsroot/abi/src/af/util/win/ut_Win32OS.h,v retrieving revision 1.3 diff -u -r1.3 ut_Win32OS.h --- src/af/util/win/ut_Win32OS.h 2001/02/06 22:54:10 1.3 +++ src/af/util/win/ut_Win32OS.h 2001/06/15 06:50:05 @@ -21,6 +21,7 @@ #ifndef UT_Win32OS_H #define UT_Win32OS_H +#include <windows.h> #include "ut_types.h" bool UT_IsWinNT(void); Index: src/wp/ap/win/ap_Win32App.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/ap/win/ap_Win32App.cpp,v retrieving revision 1.64 diff -u -r1.64 ap_Win32App.cpp --- src/wp/ap/win/ap_Win32App.cpp 2001/06/07 15:52:36 1.64 +++ src/wp/ap/win/ap_Win32App.cpp 2001/06/15 06:50:44 @@ -58,6 +58,7 @@ #include "ap_EditMethods.h" #include "fp_Run.h" +#include "ut_Win32OS.h" #include "ie_exp.h" #include "ie_exp_AbiWord_1.h" @@ -435,7 +436,8 @@ // put raw text on the clipboard - if (XAP_EncodingManager::get_instance()->isUnicodeLocale()) + // TODO Should use a finer-grain technique than IsWinNT() since Win98 supports unicode clipboard. + if (UT_IsWinNT()) { // put raw unicode text on the clipboard // TODO On NT we should always put unicode text on the clipboard regardless of locale. Index: src/wp/impexp/xp/ie_exp_Text.cpp =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.cpp,v retrieving revision 1.26 diff -u -r1.26 ie_exp_Text.cpp --- src/wp/impexp/xp/ie_exp_Text.cpp 2001/06/11 12:04:18 1.26 +++ src/wp/impexp/xp/ie_exp_Text.cpp 2001/06/15 06:50:54 @@ -1,19 +1,19 @@ /* AbiWord * Copyright (C) 1998 AbiSource, Inc. - * + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ @@ -33,14 +33,15 @@ #include "ut_iconv.h" #include "ut_wctomb.h" #include "xap_EncodingManager.h" - -#include "ut_string_class.h" - #include "ap_Dialog_Id.h" #include "xap_App.h" #include "xap_DialogFactory.h" #include "xap_Dlg_Encoding.h" +#ifdef WIN32 +#include "ut_Win32OS.h" +#endif + #define MY_MB_LEN_MAX 6 ////////////////////////////////////////////////////////////////// @@ -58,7 +59,7 @@ bool bIs16Bit, bool bUseBOM, bool bBigEndian); - virtual ~s_Text_Listener() {}; + virtual ~s_Text_Listener() {} virtual bool populate(PL_StruxFmtHandle sfh, const PX_ChangeRecord * pcr); @@ -81,17 +82,21 @@ virtual bool signal(UT_uint32 iSignal); protected: - void _closeBlock(void); + void _genBOM(void); + void _genLineBreak(void); void _outputData(const UT_UCSChar * p, UT_uint32 length); - void _output8BitData(const UT_UCSChar * , UT_uint32 length); - void _output16BitData(const UT_UCSChar * , UT_uint32 length); - + void _closeBlock(void); + PD_Document * m_pDocument; IE_Exp_Text * m_pie; + UT_Wctomb m_wctomb; + char m_mbBOM[MY_MB_LEN_MAX]; + int m_iBOMLen; + char m_mbLineBreak[MY_MB_LEN_MAX*2]; + int m_iLineBreakLen; bool m_bInBlock; bool m_bToClipboard; bool m_bFirstWrite; - UT_Wctomb m_wctomb; const char * m_szEncoding; bool m_bIs16Bit; bool m_bBigEndian; @@ -102,29 +107,21 @@ /*****************************************************************/ IE_Exp_Text::IE_Exp_Text(PD_Document * pDocument, bool bEncoded) - : IE_Exp(pDocument) + : 
IE_Exp(pDocument), + m_pListener(NULL), + m_bIsEncoded(bEncoded) { UT_ASSERT(pDocument); + m_error = 0; + const char *szEncodingName = pDocument->getEncodingName(); if (!szEncodingName || !*szEncodingName) szEncodingName = XAP_EncodingManager::get_instance()->getNativeEncodingName(); - m_error = 0; - m_pListener = NULL; - m_bIsEncoded = bEncoded; - - // TODO Use persistent document encoding when it exists _setEncoding(szEncodingName); } -/*! - Destruct text exporter - */ -IE_Exp_Text::~IE_Exp_Text() -{ -} - /*****************************************************************/ /*****************************************************************/ @@ -198,6 +195,17 @@ UT_Error IE_Exp_Text::_writeDocument(void) { + // TODO If we're going to the clipboard and the OS supports unicode, set encoding. + // TODO Only supports Windows so far. + // TODO Should use a finer-grain technique than IsWinNT() since Win98 supports unicode clipboard. + if (m_pDocRange) + { +#ifdef WIN32 + if (UT_IsWinNT()) + _setEncoding(XAP_EncodingManager::get_instance()->getNativeUnicodeEncodingName()); +#endif + } + m_pListener = new s_Text_Listener(m_pDocument,this,(m_pDocRange!=NULL),m_szEncoding,m_bIs16Bit,m_bUseBOM,m_bBigEndian); if (!m_pListener) return UT_IE_NOMEMORY; @@ -226,6 +234,7 @@ /*! 
Request file encoding from user + \param szEncoding Encoding to export file into This function should be identical to the one in ie_Imp_Text */ @@ -234,10 +243,10 @@ XAP_Dialog_Id id = XAP_DIALOG_ID_ENCODING; XAP_DialogFactory * pDialogFactory - = (XAP_DialogFactory *)(m_pDocument->getApp()->getDialogFactory()); + = reinterpret_cast<XAP_DialogFactory *>(m_pDocument->getApp()->getDialogFactory()); XAP_Dialog_Encoding * pDialog - = (XAP_Dialog_Encoding *)(pDialogFactory->requestDialog(id)); + = reinterpret_cast<XAP_Dialog_Encoding *>(pDialogFactory->requestDialog(id)); UT_ASSERT(pDialog); pDialog->setEncoding(szEncoding); @@ -261,7 +270,7 @@ UT_ASSERT (s); strcpy(szEnc,s); - _setEncoding((const char *)szEnc); + _setEncoding(reinterpret_cast<const char *>(szEnc)); m_pDocument->setEncodingName(szEnc); } @@ -270,214 +279,191 @@ return bOK; } -/*****************************************************************/ -/*****************************************************************/ +/*! + Set exporter's encoding and related members + \param szEncoding Encoding to export file into -void s_Text_Listener::_closeBlock(void) + Decides endian and BOM policy based on encoding + */ +void IE_Exp_Text::_setEncoding(const char *szEncoding) { - if (!m_bInBlock) - return; + UT_ASSERT(szEncoding); - UT_UCSChar wcLineBreak[3] = {0,0,0}; - char mbLineBreak[MY_MB_LEN_MAX*2]; - UT_UCSChar *pWC = wcLineBreak; - char *pMB = mbLineBreak; - int mbLen; + m_szEncoding = szEncoding; - // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. + // TODO some iconvs use a different string! + if (!strncmp(m_szEncoding,"UCS-2",5)) + { + m_bIs16Bit = true; + if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "BE")) + m_bBigEndian = true; + else if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "LE")) + m_bBigEndian = false; + else + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); + + // TODO Should BOM use be a user pref? + // TODO Does Mac OSX prefer BOMs? 
#ifdef WIN32 - wcLineBreak[0] = '\r'; - wcLineBreak[1] = '\n'; + m_bUseBOM = true; #else - wcLineBreak[0] = '\n'; + m_bUseBOM = false; #endif - - while (*pWC) + } + else { - if (!m_wctomb.wctomb(pMB,mbLen,(wchar_t)*pWC)) - UT_ASSERT(UT_SHOULD_NOT_HAPPEN); - ++pWC; - pMB += mbLen; + m_bIs16Bit = false; + // These are currently meaningless when not in a Unicode encoding + m_bBigEndian = false; + m_bUseBOM = false; } - - m_pie->write(mbLineBreak,pMB-mbLineBreak); - m_bInBlock = false; - return; } +/*****************************************************************/ +/*****************************************************************/ + /*! - Output text buffer to file - \param data Buffer to output - \param length Size of buffer + Generate correct BOM - Currently calls different functions for 8 bit and 16 bit encodings due to use - of dynamically sized string classes which only support one character size. - If we support UCS-4/UTF-32 in the future we'll need a third function. + Makes a Byte Order Mark correct for the encoding. 
*/ -void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length) +void s_Text_Listener::_genBOM(void) { - if (m_bFirstWrite) - { - UT_ASSERT(m_szEncoding); - m_wctomb.setOutCharset(m_szEncoding); + // TODO iconv (at least libiconv) actually converts BOM to nothing at all ): +#if 0 + UT_UCSChar wcBOM[2] = {0,0}; + UT_UCSChar *pWC = wcBOM; + char *pMB = reinterpret_cast<char *>(m_mbBOM); + int mbLen; - // TODO BOMs need separate code for UTF-7, UCS-4, etc - if (m_bUseBOM) + wcBOM[0] = UCS_BOM; + + while (*pWC) + { + if (!m_wctomb.wctomb(pMB,mbLen,static_cast<wchar_t>(*pWC))) + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); + ++pWC; + pMB += mbLen; + } + m_iBOMLen = pMB - m_mbBOM; +#else + // Hard-coded BOM values + if (m_bIs16Bit) + { + // This code should cover UCS-2 and UTF-16, both endians + if (m_bBigEndian) { - if (m_bIs16Bit) - { - if (m_bBigEndian) - m_pie->write("\xfe\xff",2); - else - m_pie->write("\xff\xfe",2); - } - else - { - // TODO There may be reason for using a BOM in UTF-8 text. - // TODO I've seen MS software do it. - m_pie->write("\xef\xbb\xbf",3); - } + strcpy(m_mbBOM,"\xfe\xff"); + m_iBOMLen = 2; } - m_bFirstWrite = false; + else + { + strcpy(m_mbBOM,"\xff\xfe"); + m_iBOMLen = 2; + } } - - // TODO some iconvs use a different string! - if (m_bIs16Bit) - _output16BitData(data, length); else - _output8BitData(data, length); + { + // This code covers UTF-8 only + strcpy(m_mbBOM,"\xef\xbb\xbf"); + m_iBOMLen = 3; + } + // TODO UTF-7, UCS-4, UTF-32 +#endif } /*! - Output 8-bit text buffer to file - \param data Buffer to output - \param length Size of buffer + Generate correct line break characters - Single byte and multi byte encodings are supported. - Wide character encodings are not supported. - The buffer must not contain NULL bytes. + Makes a line break correct for the encoding and platform. 
*/ -void s_Text_Listener::_output8BitData(const UT_UCSChar * data, UT_uint32 length) +void s_Text_Listener::_genLineBreak(void) { - UT_String sBuf; - const UT_UCSChar * pData; - + UT_UCSChar wcLineBreak[3] = {0,0,0}; + UT_UCSChar *pWC = wcLineBreak; + char *pMB = reinterpret_cast<char *>(m_mbLineBreak); int mbLen; - char pC[MY_MB_LEN_MAX]; - // TODO Do we still need this? - UT_ASSERT(sizeof(UT_Byte) == sizeof(char)); - - for (pData=data; (pData<data+length); /**/) - { - if(!m_wctomb.wctomb(pC,mbLen,(wchar_t)*pData)) - { - mbLen=1; - pC[0]='?'; - m_wctomb.initialize(); - } - if (mbLen>1) - { - pC[mbLen]='\0'; - sBuf += pC; - } - else - { - // We let any UCS_LF's (forced line breaks) go out as is. - // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. + // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. #ifdef WIN32 - if (pC[0]==UCS_LF) - sBuf += '\r'; + wcLineBreak[0] = '\r'; + wcLineBreak[1] = '\n'; +#else + wcLineBreak[0] = '\n'; #endif - sBuf += (char)pC[0]; - } - pData++; - } - m_pie->write(sBuf.c_str(),sBuf.size()); + while (*pWC) + { + if (!m_wctomb.wctomb(pMB,mbLen,static_cast<wchar_t>(*pWC))) + UT_ASSERT(UT_SHOULD_NOT_HAPPEN); + ++pWC; + pMB += mbLen; + } + m_iLineBreakLen = pMB - m_mbLineBreak; } /*! - Output 16-bit text buffer to file + Output text buffer to stream \param data Buffer to output \param length Size of buffer - - Supports the UCS-2 encodings. UCS-2 streams include NULL bytes. 
*/ -void s_Text_Listener::_output16BitData(const UT_UCSChar * data, UT_uint32 length) +void s_Text_Listener::_outputData(const UT_UCSChar * data, UT_uint32 length) { - UT_UCS2String sBuf; + UT_ByteBuf bBuf; const UT_UCSChar * pData; - + int mbLen; - UT_UCSChar c; + char pC[MY_MB_LEN_MAX]; + + if (m_bFirstWrite) + { + UT_ASSERT(m_szEncoding); + m_wctomb.setOutCharset(m_szEncoding); - UT_ASSERT(sizeof(UT_Byte) == sizeof(char)); + if (m_bUseBOM) + _genBOM(); + _genLineBreak(); + + // TODO BOMs need separate code for UTF-7, UCS-4, etc + if (m_bUseBOM) + m_pie->write(reinterpret_cast<const char *>(m_mbBOM),m_iBOMLen); + + m_bFirstWrite = false; + } - for (pData=data; (pData<data+length); /**/) + for (pData=data; (pData<data+length); ++pData) { - if(!m_wctomb.wctomb(reinterpret_cast<char *>(&c),mbLen,(wchar_t)*pData)) + // We let any UCS_LF's (forced line breaks) go out as is. + if (*pData==UCS_LF) + bBuf.append(reinterpret_cast<UT_Byte *>(m_mbLineBreak),m_iLineBreakLen); + else { - mbLen=2; - // TODO U+FFFD "REPLACEMENT CHARACTER" is the - // TODO correct unicode equivalent of '?' isn't it? - // TODO Or is it U+25A0 "BLACK SQUARE"? - c=0xFFFD; - m_wctomb.initialize(); + if (!m_wctomb.wctomb(pC,mbLen,static_cast<wchar_t>(*pData))) + { + mbLen=1; + pC[0]='?'; + m_wctomb.initialize(); + } + UT_ASSERT(mbLen>=1); + bBuf.append(reinterpret_cast<const UT_Byte *>(pC),mbLen); } - UT_ASSERT(mbLen==2); - // We let any UCS_LF's (forced line breaks) go out as is. - // TODO Old Mac should use "\r". Mac OSX should Use U+2028 or U+2029. -#ifdef WIN32 - if (c==UCS_LF) - sBuf += '\r'; -#endif - sBuf += c; - - pData++; } - m_pie->write(reinterpret_cast<const char *>(sBuf.ucs_str()),sBuf.size()*sizeof(UT_UCSChar)); + m_pie->write(reinterpret_cast<const char *>(bBuf.getPointer(0)),bBuf.getLength()); } - -/*! 
- Set exporter's encoding and related members - \param szEncoding Encoding to export file into - Decides endian and BOM policy based on encoding - */ -void IE_Exp_Text::_setEncoding(const char *szEncoding) +void s_Text_Listener::_closeBlock(void) { - UT_ASSERT(szEncoding); + if (!m_bInBlock) + return; - m_szEncoding = szEncoding; + UT_ASSERT(!m_bFirstWrite); + UT_ASSERT(m_iLineBreakLen); - // TODO some iconvs use a different string! - if (!strncmp(m_szEncoding,"UCS-2",5)) - { - m_bIs16Bit = true; - if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "BE")) - m_bBigEndian = true; - else if (!strcmp(m_szEncoding + strlen(m_szEncoding) - 2, "LE")) - m_bBigEndian = false; - else - UT_ASSERT(UT_SHOULD_NOT_HAPPEN); + m_pie->write(reinterpret_cast<const char *>(m_mbLineBreak),m_iLineBreakLen); - // TODO Should BOM use be a user pref? - // TODO Does Mac OSX prefer BOMs? -#ifdef WIN32 - m_bUseBOM = true; -#else - m_bUseBOM = false; -#endif - } - else - { - m_bIs16Bit = false; - // These are currently meaningless when not in a Unicode encoding - m_bBigEndian = false; - m_bUseBOM = false; - } + m_bInBlock = false; + return; } s_Text_Listener::s_Text_Listener(PD_Document * pDocument, @@ -487,19 +473,19 @@ bool bIs16Bit, bool bUseBOM, bool bBigEndian) + : m_pDocument(pDocument), + m_pie(pie), + m_bToClipboard(bToClipboard), + // when we are going to the clipboard, we should implicitly + // assume that we are starting in the middle of a block. + // when going to a file we should not. + m_bInBlock(bToClipboard), + m_bFirstWrite(true), + m_szEncoding(szEncoding), + m_bIs16Bit(bIs16Bit), + m_bUseBOM(bToClipboard ? false : bUseBOM), + m_bBigEndian(bBigEndian) { - m_pDocument = pDocument; - m_pie = pie; - m_bToClipboard = bToClipboard; - // when we are going to the clipboard, we should implicitly - // assume that we are starting in the middle of a block. - // when going to a file we should not. 
- m_bInBlock = m_bToClipboard; - m_bFirstWrite = true; - m_szEncoding = szEncoding; - m_bIs16Bit = bIs16Bit; - m_bUseBOM = bToClipboard ? false : bUseBOM; - m_bBigEndian = bBigEndian; } bool s_Text_Listener::populate(PL_StruxFmtHandle /*sfh*/, @@ -509,7 +495,7 @@ { case PX_ChangeRecord::PXT_InsertSpan: { - const PX_ChangeRecord_Span * pcrs = static_cast<const PX_ChangeRecord_Span *> (pcr); + const PX_ChangeRecord_Span * pcrs = static_cast<const PX_ChangeRecord_Span *>(pcr); PT_BufIndex bi = pcrs->getBufIndex(); _outputData(m_pDocument->getPointer(bi),pcrs->getLength()); @@ -521,8 +507,8 @@ { #if 1 // TODO decide how to indicate objects in text output. - - const PX_ChangeRecord_Object * pcro = static_cast<const PX_ChangeRecord_Object *> (pcr); + + const PX_ChangeRecord_Object * pcro = static_cast<const PX_ChangeRecord_Object *>(pcr); //PT_AttrPropIndex api = pcr->getIndexAP(); fd_Field* field; switch (pcro->getObjectType()) @@ -540,7 +526,7 @@ // if(field->getValue() != NULL) m_pie->write(field->getValue()); - + return true; default: @@ -566,7 +552,7 @@ PL_StruxFmtHandle * psfh) { UT_ASSERT(pcr->getType() == PX_ChangeRecord::PXT_InsertStrux); - const PX_ChangeRecord_Strux * pcrx = static_cast<const PX_ChangeRecord_Strux *> (pcr); + const PX_ChangeRecord_Strux * pcrx = static_cast<const PX_ChangeRecord_Strux *>(pcr); *psfh = 0; // we don't need it. switch (pcrx->getStruxType()) Index: src/wp/impexp/xp/ie_exp_Text.h =================================================================== RCS file: /cvsroot/abi/src/wp/impexp/xp/ie_exp_Text.h,v retrieving revision 1.9 diff -u -r1.9 ie_exp_Text.h --- src/wp/impexp/xp/ie_exp_Text.h 2001/06/07 15:52:42 1.9 +++ src/wp/impexp/xp/ie_exp_Text.h 2001/06/15 06:50:54 @@ -1,19 +1,19 @@ /* AbiWord * Copyright (C) 1998 AbiSource, Inc. 
- * + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ @@ -66,20 +66,20 @@ { public: IE_Exp_Text(PD_Document * pDocument, bool bEncoded=false); - virtual ~IE_Exp_Text(); + virtual ~IE_Exp_Text() {} protected: virtual UT_Error _writeDocument(void); virtual bool _openFile(const char * szFilename); bool _doEncodingDialog(const char *szEncoding); void _setEncoding(const char *szEncoding); - + s_Text_Listener * m_pListener; bool m_bIsEncoded; const char * m_szEncoding; bool m_bIs16Bit; - bool m_bUseBOM; bool m_bBigEndian; + bool m_bUseBOM; }; #endif /* IE_EXP_TEXT_H */
_________________________________________________________ Do You Yahoo!? Get your free @yahoo.com address at http://mail.yahoo.com
This archive was generated by hypermail 2b25 : Fri Jun 15 2001 - 02:08:49 CDT