Re: CJK line breaking

From: Tomas Frydrych <tomasfrydrych_at_yahoo.co.uk>
Date: Mon Mar 07 2005 - 23:11:06 CET

Hi Roland,

Many thanks for your work on this; I have not had a chance to test the
patch; I think it is OK in principle but I have a couple of formal comments:

1) I would be better if the CJK breaking logic could be inside the
encoding manager rather than the GR_Graphics::canBreak(). The problem is
that the virtual GR_Graphics::canBreak()is overwritten in some of the
platform specific graphics classes and we might need to make the same
kind of decisions in those functions as well, depending on how well each
platform handles CJK. It would be nice if we could do that with minimal
code duplication.

2) Please use UT_return_val_if_fail() instead of UT_ASSERT() on
pointers; this prevents AW from crashing due to dereferencing NULL in
the non-debug build (the UT_ASSERT macro is supposed to be deprecated;
we have a UT_ASSERT_HARMLESS() to indicate clearly that while something
is desirable, failure is not catastrophic).

3) Please use UT_DEBUGMSG(()) instead of printf to print debug info.

4) If you would follow the naming conventions described in
abi/docs/AbiSourceCodeGuidelines.abw (I realise that the Encoding
manager is not a great example in this regard).

I am not sure what is causing the asserts you mention, but perhaps the
iterators should have the upper limit set to the end of block rather
than not at all as I initially suggested. I hope to have a closer look
at this later this week.

Tomas

Roland Kay wrote:
> Sorry. I forgot to attached the files. The attachments to this email are
> CJK.patch (the patch), TextMessages.abw (a document that causes the
> asserts).
>
> R.
>
>
> ------------------------------------------------------------------------
>
> diff -uNr abi-old/src/af/gr/xp/gr_Graphics.cpp abi/src/af/gr/xp/gr_Graphics.cpp
> --- abi-old/src/af/gr/xp/gr_Graphics.cpp 2005-03-05 05:30:33.000000000 +0000
> +++ abi/src/af/gr/xp/gr_Graphics.cpp 2005-03-05 08:09:22.000000000 +0000
> @@ -1095,23 +1095,77 @@
> }
>
> /*!
> - return true if linebreak at character c is permissible
> - the built-in class is too simple to differentiate between breaks before and after character
> + * return true if linebreak after/before character c is permissible.
> + *
> + * When editing Chinese documents certain characters cannot be allowed
> + * to start a line (eg. ".,)>>}") and certain others cannot end a line
> + * (eg. "(<<{"). Also, a few characters like "--" must occur in pairs
> + * which cannot be split. This function returns true or false in such a
> + * way as to ensure these rules. The characters affected are defined
> + * in the table in xap_EncodingManager.cpp
> */
> -bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool /* bAfter */)
> +bool GR_Graphics::canBreak(GR_RenderInfo & ri, UT_sint32 &iNext, bool bAfter)
> {
> + UT_return_val_if_fail(getApp(), false);
> + const class XAP_EncodingManager *EncMan=getApp()->getEncodingManager();
> + struct cjk_prop_ cjkprop[3];
> + UT_UCS4Char c[3];
> + int i;
> +
> + UT_ASSERT(EncMan);
> +
> iNext = -1; // we do not bother with this
> UT_return_val_if_fail(ri.m_pText && ri.m_pText->getStatus() == UTIter_OK, false);
>
> *(ri.m_pText) += ri.m_iOffset;
> UT_return_val_if_fail(ri.m_pText->getStatus() == UTIter_OK, false);
>
> - UT_UCS4Char c = ri.m_pText->getChar();
> -
> - UT_return_val_if_fail(getApp(), false);
> - return getApp()->getEncodingManager()->can_break_at(c);
> + /*
> + * For CJK we need to consider the characters either side as well.
> + */
> + c[1] = ri.m_pText->getChar();
> + --(*ri.m_pText);
> + c[0] = ri.m_pText->getChar();
> + ++(*ri.m_pText); ++(*ri.m_pText);
> + c[2] = ri.m_pText->getChar();
> +
> + // Get the CJK properties of the three chars.
> + for (i=0; i<3; i++)
> + cjkprop[i] = EncMan->char_cjk_prop(c[i]);
> +
> +#if 1
> + // This is just for debugging
> + printf("KAY: Break %s ", bAfter?"after":"before");
> + for (i=0; i<3; i++) {
> + printf("c%d: %x(%s%s%s%s) ", i, c[i],
> + cjkprop[i].cjk?"C":".",
> + cjkprop[i].cant_start_line?"S":".",
> + cjkprop[i].cant_end_line?"E":".",
> + cjkprop[i].must_not_split?"N":".");
> + }
> + printf("\n");
> +#endif
> +
> + if (bAfter) // We're being asked about breaking after.
> + {
> + if (cjkprop[1].must_not_split && c[1]==c[2])
> + return false;
> + if (cjkprop[1].cant_end_line || !cjkprop[1].cjk)
> + return EncMan->can_break_at(c[1], bAfter);
> + return !cjkprop[2].cant_start_line;
> + }
> + else // We're being asked about breaking before.
> + {
> + if (cjkprop[1].must_not_split && c[1]==c[0])
> + return false;
> + if (cjkprop[1].cant_start_line || !cjkprop[1].cjk)
> + return EncMan->can_break_at(c[1], bAfter);
> + return !cjkprop[1].cant_end_line;
> + }
> + UT_ASSERT(UT_SHOULD_NOT_HAPPEN);
> }
>
> +
> /*!
> resetJustification() makes the data represented by ri unjustified
> and returns value by which the total width changed as a result such
> diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.cpp abi/src/af/xap/xp/xap_EncodingManager.cpp
> --- abi-old/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-05 05:30:41.000000000 +0000
> +++ abi/src/af/xap/xp/xap_EncodingManager.cpp 2005-03-05 08:50:29.000000000 +0000
> @@ -646,6 +646,7 @@
> };
>
>
> +#if 0 // This function is made obsolete by rules in gr_Graphics::canBreak()
> /*
> TODO I'm pretty sure you can't break Korean at any character.
> TODO And what about Japanese Katakana and Hiragana?
> @@ -656,6 +657,109 @@
> {"1",cjk_languages},
> {NULL}
> };
> +#endif
> +
> +/*
> + *
> + * A list of characters that cannot appear at the beginning
> + * of a line. These must be in NUMERICAL ORDER.
> + *
> + * This is mainly for CJK languages in which we can break between
> + * any character.
> + */
> +static const UT_UCS4Char cjk_cant_start_line[]=
> +{
> + 0x0021, // ASCII eclamation mark.
> + 0x0029, // ASCII right parenthesis
> + 0x002e, // ASCII full stop.
> + 0x002c, // ASCII colon.
> + 0x003b, // ASCII semicolon.
> + 0x003e, // ASCII greater-than sign.
> + 0x003f, // ASCII question mark.
> + 0x2019, // Right single quotation mark. (fullwidth?)
> + 0x201d, // Right double quotation marks. (fullwidth?)
> + 0x3001, // Ideographic comma.
> + 0x3002, // Ideographic full stop.
> + 0x300b, // Right double angle bracket.
> + 0xff01, // Fullwidth exclamation mark.
> + 0xff09, // Fullwidth right parenthesis.
> + 0xff0c, // Full width comma.
> + 0xff1a, // Fullwidth colon.
> + 0xff1b, // Fullwidth semicolon.
> + 0xfe1e, // Fullwidth greater-than sign.
> + 0xff1f, // Fullwidth question mark.
> + 0xff3d, // Fullwidth right square bracket.
> + 0xff5d, // Fullwidth right curly bracket.
> + 0x0
> +};
> +
> +/* These must also be in NUMERICAL ORDER. */
> +static const UT_UCS4Char cjk_cant_end_line[]=
> +{
> + 0x0028, // ASCII left parenthesis.
> + 0x003c, // ASCII less-than sign.
> + 0x201c, // Left double quotation mark.
> + 0x300a, // Left double angle bracket.
> + 0xff08, // Fullwidth left parenthesis.
> + 0xff1c, // Fullwifth less-than sign.
> + 0xff3b, // Fullwidth left square bracket.
> + 0xff5b, // Fullwidth left curly bracket.
> + 0x0
> +};
> +
> +/*
> + * A list of characters that, when in pairs, must not be split.
> + * These must also be in NUMERICAL ORDER.
> + */
> +static const UT_UCS4Char cjk_must_not_split[]=
> +{
> + 0x2014, // EM Dash.
> + 0x2036 // Horizontal ellipsis.
> +};
> +
> +/*
> + * Returns true if "c" is in "list".
> + */
> +static bool is_in_list(UT_UCS4Char c, const UT_UCS4Char *list)
> +{
> + int i=0;
> +
> + while ((c >= list[i]) && (list[i] != 0)) {
> + if (list[i]==c)
> + return true;
> + i++;
> + }
> + return false;
> +}
> +
> +/*
> + * Given a characte rreturns a value that indicates if it is a
> + * CJK character and if so is it forbidden from the beginning or
> + * end of a line.
> + */
> +struct cjk_prop_ XAP_EncodingManager::char_cjk_prop(UT_UCS4Char c) const
> +{
> + struct cjk_prop_ prop = {false, false, false};
> +
> + // Does this fall into any of the ranges for CJK characters?
> + if ((c>=0x20000 && c<=0x2a6df) || (c>=0x2f800 && c<=0x2fa1f) ||
> + (c>=0x3000 && c<=0x303f) || (c>=0x3200 && c<=0x32ff) ||
> + (c>=0x3300 && c<=0x33ff) || (c>=0x3400 && c<=0x4dbf) ||
> + (c>=0x4e00 && c<=0x9faf) || (c>=0xf900 && c<=0xfaff) ||
> + (c>=0xfe30 && c<=0xfe4f))
> + prop.cjk = true;
> +
> + if (is_in_list(c, cjk_cant_start_line))
> + prop.cant_start_line = true;
> +
> + if (is_in_list(c, cjk_cant_end_line))
> + prop.cant_end_line = true;
> +
> + if (is_in_list(c, cjk_must_not_split))
> + prop.must_not_split = true;
> +
> + return prop;
> +}
>
> /*
> This table is useful since some iconv implementations don't know some cpNNNN
> @@ -1111,8 +1215,11 @@
> {
> const char* str = search_rmap_with_opt_suffix(langcode_to_cjk,SEARCH_PARAMS);
> is_cjk_ = *str == '1';
> - str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS);
> - can_break_words_ = *str == '1';
> +// This is made obsolete by new CJK line break handling in
> +// gr_Graphics::canBreak.
> +//
> +// str = search_rmap_with_opt_suffix(can_break_words_data,SEARCH_PARAMS);
> +// can_break_words_ = *str == '1';
> }
> {
> if (cjk_locale()) {
> @@ -1160,10 +1267,13 @@
>
> int XAP_EncodingManager__swap_stou,XAP_EncodingManager__swap_utos;
>
> +
> +#if 0 // This is made obsolete by new CJK line breaking in gr_Graphics::canBreak
> bool XAP_EncodingManager::can_break_words() const
> {
> return can_break_words_;
> }
> +#endif
>
> /*
> I'm not sure whether any non-cjk language doesn't make distinction
> @@ -1172,38 +1282,101 @@
> */
> bool XAP_EncodingManager::single_case() const { return cjk_locale(); }
>
> +#if 0 // This function is now performed by XAP_EncodingManager::char_cjk_prop
> bool XAP_EncodingManager::is_cjk_letter(UT_UCSChar c) const
> {
> if (!cjk_locale())
> return 0;
> return (c>0xff);
> }
> +#endif
>
> bool XAP_EncodingManager::noncjk_letters(const UT_UCSChar* str,int len) const
> {
> if (!cjk_locale())
> return 1;
> for(int i=0;i<len;++i) {
> - if (is_cjk_letter(str[i]))
> +// if (is_cjk_letter(str[i]))
> + if (char_cjk_prop(str[i]).cjk)
> return 0;
> };
> return 1;
> }
>
> +static UT_UCSChar can_break_after[] =
> +{
> + 0x0020, // UCS_SPACE
> + 0x0021, // !
> + 0x0029, // )
> + 0x002c, // ,
> + 0x002d, // UCS_MINUS
> + 0x002e, // .
> + 0x003a, // :
> + 0x003b, // ;
> + 0x003e, // >
> + 0x003f, // ?
> + 0x005d, // ]
> + 0x007d, // }
> + 0x2010, // UCS_HYPHEN
> + 0x2013, // UCS_EN_DASH
> + 0x2014, // UCS_EM_DASH
> + 0x3002, // Ideographic full stop
> + 0x300b, // Right double angle bracket
> + 0xff01, // Fullwidth exclamation mark
> + 0xff09, // Fullwidth right parenthesis
> + 0xff0c, // Fullwidth comma
> + 0xff0d, // Fullwidth hypen-minus
> + 0xff1a, // Fullwidth colon
> + 0xff1b, // Fullwidth semi-colon
> + 0xff1f, // Fullwidth question mark
> + 0xff3d, // Fullwidth right square bracket
> + 0xff5d, // Fullwidth right curly bracket
> + 0x0
> +};
> +
> +static UT_UCSChar can_break_before[] =
> +{
> + 0x0028, // (
> + 0x003c, // <
> + 0x005b, // [
> + 0x007b, // {
> + 0x300a, // Left double angle bracket
> + 0xff08, // Fullwidth left parenthesis
> + 0xff3b, // Fullwidth left square bracket
> + 0xff5b, // Fullwidth left curly bracket
> + 0x0
> +};
> +
> /*
> - This one correlates with can_break_words() very tightly.
> - Under CJK locales it returns 1 for cjk letters.
> - Under non-CJK locales returns 0.
> + * It appears that without the benefit of spaces we have to be much more
> + * consistent about whether a character can be broken before or after. So using
> + * the above tables we deal with punctuation more accurately. Eg, you can
> + * break after a comma but not before.
> */
> -bool XAP_EncodingManager::can_break_at(const UT_UCSChar c) const
> +bool XAP_EncodingManager::can_break_at(const UT_UCSChar c, bool bAfter) const
> {
> - if (c == UCS_SPACE
> + if (bAfter)
> + {
> + if (is_in_list(c, can_break_after))
> + return true;
> + }
> + else
> + {
> + if (is_in_list(c, can_break_before))
> + return true;
> + }
> + return false;
> +
> +/* if (c == UCS_SPACE
> || c == UCS_MINUS
> || c == UCS_HYPHEN
> || c == UCS_EN_DASH
> || c == UCS_EM_DASH)
> return 1;
> - return is_cjk_letter(c);
> +
> + return 0; */
> +// return is_cjk_letter(c); // CJK line breaking is now handled in
> +// GR_Graphics::canBreak().
> }
>
>
> @@ -1267,7 +1440,10 @@
> "--->8--------------\n"
>
> " WinLanguageCode is 0x%04x, WinCharsetCode is %d\n"
> - " cjk_locale %d, can_break_words %d, swap_utos %d, swap_stou %d\n",
> + " cjk_locale %d, "
> + // "can_break_words %d, " Made obsolete by new CJK line breaking in
> + // gr_Graphics:canBreak
> + "swap_utos %d, swap_stou %d\n",
> getLanguageISOName(), getLanguageISOTerritory() ? getLanguageISOTerritory() : "NULL",
> getNativeEncodingName(),getNativeSystemEncodingName(),
> getNative8BitEncodingName(),getNativeNonUnicodeEncodingName(),
> @@ -1275,7 +1451,7 @@
> fallbackChar(1072),
> getTexPrologue(),
> getWinLanguageCode(), getWinCharsetCode(),
> - int(cjk_locale()), int(can_break_words()),int(swap_utos),int(swap_stou)
> + int(cjk_locale()), /*int(can_break_words()),*/ int(swap_utos),int(swap_stou)
> ));
> UT_ASSERT( UT_iconv_isValid(iconv_handle_N2U) && UT_iconv_isValid(iconv_handle_U2N) );
> }
> diff -uNr abi-old/src/af/xap/xp/xap_EncodingManager.h abi/src/af/xap/xp/xap_EncodingManager.h
> --- abi-old/src/af/xap/xp/xap_EncodingManager.h 2005-03-05 05:30:41.000000000 +0000
> +++ abi/src/af/xap/xp/xap_EncodingManager.h 2005-03-05 08:08:03.000000000 +0000
> @@ -38,6 +38,17 @@
> #include "ut_iconv.h"
> #include "ut_xml.h"
>
> +/*
> + * CJK character properties struct.
> + */
> +struct cjk_prop_ {
> + bool cjk;
> + bool cant_end_line;
> + bool cant_start_line;
> + bool must_not_split;
> +};
> +
> +
> struct ABI_EXPORT XAP_LangInfo
> {
> /*no memeber can have NULL value. If string is empty, then value is
> @@ -169,7 +180,7 @@
> /* whether words can be broken at any character of the word (wide
> character, not byte). True for japanese.
> */
> - virtual bool can_break_words() const;
> +// virtual bool can_break_words() const;
>
> /*
> returns true if there is no distinction between upper and lower
> @@ -189,12 +200,18 @@
> Under CJK locales it returns 1 for cjk letters.
> Under non-CJK locales returns 0.
> */
> - virtual bool can_break_at(const UT_UCSChar c) const;
> + virtual bool can_break_at(const UT_UCSChar c, bool bAfter) const;
>
> /*
> This should be as precise as possible.
> */
> - virtual bool is_cjk_letter(UT_UCSChar c) const;
> +// virtual bool is_cjk_letter(UT_UCSChar c) const;
> +
> + /*
> + * Returns a value that indicates if c is a CJK character and, if so,
> + * what it's line breaking properties are.
> + */
> + virtual struct cjk_prop_ char_cjk_prop(UT_UCS4Char c) const;
>
> /*
> This is rather smart wrapper for wvLIDToCodePageConverter.
> diff -uNr abi-old/src/text/fmt/xp/fp_TextRun.cpp abi/src/text/fmt/xp/fp_TextRun.cpp
> --- abi-old/src/text/fmt/xp/fp_TextRun.cpp 2005-03-05 05:30:49.000000000 +0000
> +++ abi/src/text/fmt/xp/fp_TextRun.cpp 2005-03-05 06:23:11.000000000 +0000
> @@ -355,7 +355,7 @@
> PD_StruxIterator text(getBlock()->getStruxDocHandle(),
> getBlockOffset() + fl_BLOCK_STRUX_OFFSET);
> UT_return_val_if_fail(text.getStatus() == UTIter_OK, false);
> - text.setUpperLimit(text.getPosition() + getLength() - 1);
> + //text.setUpperLimit(text.getPosition() + getLength() - 1);
>
> UT_return_val_if_fail(m_pRenderInfo, false);
> m_pRenderInfo->m_pText = &text;
> @@ -389,7 +389,7 @@
> getBlockOffset() + fl_BLOCK_STRUX_OFFSET );
>
> UT_return_val_if_fail(text.getStatus() == UTIter_OK, false);
> - text.setUpperLimit(text.getPosition() + getLength() - 1);
> +// text.setUpperLimit(text.getPosition() + getLength() - 1);
>
> UT_return_val_if_fail(m_pRenderInfo, false);
> m_pRenderInfo->m_pText = &text;
> @@ -532,7 +532,7 @@
> offset + fl_BLOCK_STRUX_OFFSET);
>
> m_pRenderInfo->m_pText = &text;
> - text.setUpperLimit(text.getPosition() + getLength() - 1);
> +// text.setUpperLimit(text.getPosition() + getLength() - 1);
> UT_uint32 iPosStart = text.getPosition();
>
> //bool bReverse = (getVisDirection() == UT_BIDI_RTL);
> @@ -564,8 +564,8 @@
> text.setPosition(iPos);
> }
>
> - if (bForce || iNext == (UT_sint32)i || bCanBreak)
> - // && ((i + offset) != (getBlockOffset() + getLength() - 1))
> + if (bForce || iNext == (UT_sint32)i || bCanBreak
> + && ((i + offset) != (getBlockOffset() + getLength() - 1)))// KAY: Enabled
> {
> UT_sint32 ispace = 0;
> if(c == UCS_SPACE)
> diff -uNr abi-old/src/text/fmt/xp/fv_View.cpp abi/src/text/fmt/xp/fv_View.cpp
> --- abi-old/src/text/fmt/xp/fv_View.cpp 2005-03-05 05:30:49.000000000 +0000
> +++ abi/src/text/fmt/xp/fv_View.cpp 2005-03-05 06:23:11.000000000 +0000
> @@ -9856,8 +9856,10 @@
> // CJK-FIXME: this can work incorrectly under CJK locales
> // since it can give 'true' for UCS with value >0xff (like
> // quotes, etc).
> + // New function might be better. - R.Kay
> if (newWord ||
> - XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i]))
> +// XAP_EncodingManager::get_instance()->is_cjk_letter(pSpan[i]))
> + (XAP_EncodingManager::get_instance()->char_cjk_prop(pSpan[i]).cjk))
> {
>
> wCount.word++;
Received on Mon Mar 7 23:20:45 2005

This archive was generated by hypermail 2.1.8 : Mon Mar 07 2005 - 23:20:46 CET