Patch: RecognizeContents for UTF-8


Subject: Patch: RecognizeContents for UTF-8
From: Andrew Dunbar (falconsquire@start.com.au)
Date: Mon Apr 09 2001 - 08:10:00 CDT


Here's my patch to allow loading UTF-8 files regardless
of the filename extension, including .txt, which
is generally the case on Windows at least.

Andrew.

--- ie_imp_Text.cpp.orig Wed Feb 7 08:55:08 2001
+++ ie_imp_Text.cpp Mon Apr 9 23:08:12 2001
@@ -250,7 +250,10 @@
 
 bool IE_Imp_Text::RecognizeSuffix(const char * szSuffix)
 {
- return (UT_stricmp(szSuffix,".txt") == 0);
+ // TODO: We give the other guys a chance, since this
+ // TODO: importer is so generic. Does this seem
+ // TODO: like a sensible strategy?
+ return(false);
 }
 
 UT_Error IE_Imp_Text::StaticConstructor(PD_Document * pDocument,

--- ie_imp_UTF8.cpp.orig Wed Feb 7 08:55:08 2001
+++ ie_imp_UTF8.cpp Sun Apr 8 00:20:56 2001
@@ -308,8 +308,58 @@
 
 bool IE_Imp_UTF8::RecognizeContents(const char * szBuf, UT_uint32 iNumbytes)
 {
- // TODO: Not yet written
- return(false);
+ bool bSuccess = false;
+ const char *p = szBuf;
+
+ while (p < szBuf + iNumbytes)
+ {
+ int len;
+
+ if ((*p & 0x80) == 0) // ASCII
+ {
+ ++p;
+ continue;
+ }
+ else if (*p == 0xfe || *p == 0xff) // BOM markers? RFC2279 says illegal
+ {
+ UT_DEBUGMSG((" BOM?\n"));
+ break;
+ }
+ else if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence
+ len = 6;
+ else if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence
+ len = 5;
+ else if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence
+ len = 4;
+ else if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence
+ len = 3;
+ else if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence
+ len = 2;
+ else // not UTF-8 lead byte
+ {
+ UT_DEBUGMSG((" not utf-8 lead byte\n"));
+ UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
+ return(false);
+ }
+
+ while (--len)
+ {
+ ++p;
+ if (p >= szBuf + iNumbytes)
+ {
+ UT_DEBUGMSG((" out of data!\n"));
+ //return(false);
+ break;
+ }
+ if ((*p & 0xc0) == 0x80)
+ bSuccess = true;
+ else
+ return(false);
+ }
+ ++p;
+ }
+
+ return(bSuccess);
 }
 
 bool IE_Imp_UTF8::RecognizeSuffix(const char * szSuffix)

__________________________________________________________________
Get your free Australian email account at http://www.start.com.au



This archive was generated by hypermail 2b25 : Mon Apr 09 2001 - 08:16:47 CDT