/* NOTE(review): the header name that followed #include is missing from this
   copy of the file (the angle-bracketed text was lost); <stdio.h> is
   assumed -- TODO confirm against the original source. */
#include <stdio.h>

/* Conversion routines between Nagari (biased to Gujarati) letters and
   printed characters. The characters 0000-007f are Unicode minus an offset
   that depends on the particular Nagari script.
   By Pierre Abbat. Preliminary version 2000-02-13 */

/* Type holding one character code. Codes above 0xff are defined below,
   so a single byte is not enough. */
#define NAGCHAR unsigned short

/* Signs. */
#define CANDRABINDU 0x01
#define ANUSVARA    0x02
#define VISARGA     0x03

/* Freestanding vowels. */
#define A  0x05
#define AA 0x06
#define I  0x07
#define II 0x08
#define U  0x09
#define UU 0x0a
#define RI 0x0b
#define LI 0x0c
#define E  0x0f
#define AI 0x10
#define O  0x13
#define AU 0x14

/* Consonant letters (consonant-vowel forms). */
#define KA   0x15
#define KHA  0x16
#define GA   0x17
#define GHA  0x18
#define NGA  0x19
#define CA   0x1a
#define CHA  0x1b
#define JA   0x1c
#define JHA  0x1d
#define NYA  0x1e
#define TTA  0x1f
#define TTHA 0x20
#define DDA  0x21
#define DDHA 0x22
#define NNA  0x23
#define TA   0x24
#define THA  0x25
#define DA   0x26
#define DHA  0x27
#define NA   0x28
#define NNNA 0x29
#define PA   0x2a
#define PHA  0x2b
#define BA   0x2c
#define BHA  0x2d
#define MA   0x2e
#define YA   0x2f
#define RA   0x30
#define LA   0x32
#define LLA  0x33
#define VA   0x35
#define SHA  0x36
#define SSA  0x37
#define SA   0x38
#define HA   0x39

/* Vowel combining forms and virama. */
#define VAA    0x3e
#define VI     0x3f
#define VII    0x40
#define VU     0x41
#define VUU    0x42
#define VR     0x43
#define VRR    0x44
#define VE     0x47
#define VAI    0x48
#define VO     0x4b
#define VAU    0x4c
#define VIRAMA 0x4d

/* Further vocalic codes in the 0x60 row. */
#define RRI 0x60
#define VL  0x62

/* Codes from 0x80 up lie outside the Unicode page: a blank and special
   consonant-vowel ligatures. */
#define BLANK 0x80
#define JAA   0x81
#define JII   0x82
#define JO    0x83
#define JAU   0x84
#define RUU   0x85

/* Consonant half-letters. Each code is the matching full-letter code plus
   0x80 (KA 0x15 -> K 0x95). C and P therefore have to be 0x9a and 0xaa;
   shorter values such as 0x9 / 0xa would alias the vowels U and UU. */
#define K   0x95
#define KH  0x96
#define G   0x97
#define GH  0x98
#define NG  0x99
#define C   0x9a
#define CH  0x9b
#define J   0x9c
#define JH  0x9d
#define NY  0x9e
#define TT  0x9f
#define TTH 0xa0
#define DD  0xa1
#define DDH 0xa2
#define NN  0xa3
#define T   0xa4
#define TH  0xa5
#define D   0xa6
#define DH  0xa7
#define N   0xa8
#define P   0xaa
#define PH  0xab
#define B   0xac
#define BH  0xad
#define M   0xae
#define Y   0xaf
#define R   0xb0
#define L   0xb2
#define LL  0xb3
#define V   0xb5
#define SH  0xb6
#define SS  0xb7
#define S   0xb8
#define H   0xb9

/* Assignment of code points to ligatures is based on the first letter:
     100-11f velar 9
     120-12f palatal 4
     130-13f retroflex 4
     140-15f dental 10
     160-16b bilabial 6
     16c-16f semivowel 2
     170-17f fricative 8
     <100 with vowels
   The numbers 100-17f are for ligatures including a vowel (including
   Gujarati JI); 180-1ff are for ligatures with no vowel (some of which are
   unprintable). The number 2 is used to denote doubling, as retroflex
   consonants are denoted by symbols such as DD. */

/* Ligatures including a vowel (0x100-0x17f). */
#define K2A    0x100
#define KTA    0x101
#define KRA    0x102
#define KSSA   0x103
#define NGKA   0x104
#define NGKHA  0x105
#define NGGA   0x106
#define NGGHA  0x107
#define NGNGA  0x108
#define NGMA   0x109
#define CHRA   0x120
#define JNYA   0x121
#define TT2A   0x130
#define TTRA   0x131
#define TTHRA  0x132
#define DDRA   0x133
#define DDHRA  0x134
#define T2A    0x140
#define TRA    0x141
#define DGA    0x142
#define D2A    0x143
#define D2HA   0x144
#define DBA    0x145
#define DRA    0x146
#define DVA    0x147
#define PHRA   0x160
#define SHVA   0x170
#define SSTTA  0x171
#define SSTTHA 0x172
#define STRA   0x173

/* Ligatures with no vowel (0x180-0x1ff): the code above plus 0x80. */
#define K2    0x180
#define KT    0x181
#define KR    0x182
#define KSS   0x183
#define NGK   0x184
#define NGKH  0x185
#define NGG   0x186
#define NGGH  0x187
#define NGNG  0x188
#define NGM   0x189
#define CHR   0x1a0
#define JNY   0x1a1
#define TT2   0x1b0
#define TTR   0x1b1
#define TTHR  0x1b2
#define DDR   0x1b3
#define DDHR  0x1b4
#define T2    0x1c0
#define TR    0x1c1
#define DG    0x1c2
#define D2    0x1c3
#define D2H   0x1c4
#define DB    0x1c5
#define DR    0x1c6
#define DV    0x1c7
#define PHR   0x1e0
#define SHV   0x1f0
#define SSTT  0x1f1
#define SSTTH 0x1f2
#define STR   0x1f3

/* Metacharacters:
   ff00-ff07 match characters with flags 0-7 set respectively.
   ff08-ff1f repeat previous. Character is stored in a register.
   ff20-ff3f same as ff00-ff1f xored with 80, used on rhs of rules.
   ff70-ff8f separate lhs from rhs. The cursor is moved n-ff80 positions.
*/ #define FSV0 0xff00 #define CHL0 0xff01 #define BAC0 0xff02 #define VCF0 0xff03 #define NCV0 0xff04 #define CV0 0xff05 #define FSV1 0xff08 #define CHL1 0xff09 #define BAC1 0xff0a #define VCF1 0xff0b #define NCV1 0xff0c #define CV1 0xff0d #define FSV2 0xff10 #define CHL2 0xff11 #define BAC2 0xff12 #define VCF2 0xff13 #define NCV2 0xff14 #define CV2 0xff15 #define FSV3 0xff18 #define CHL3 0xff19 #define BAC3 0xff1a #define VCF3 0xff1b #define NCV3 0xff1c #define CV3 0xff1d #define FSV0R 0xff20 #define CHL0R 0xff21 #define BAC0R 0xff22 #define VCF0R 0xff23 #define NCV0R 0xff24 #define CV0R 0xff25 #define FSV1R 0xff28 #define CHL1R 0xff29 #define BAC1R 0xff2a #define VCF1R 0xff2b #define NCV1R 0xff2c #define CV1R 0xff2d #define FSV2R 0xff30 #define CHL2R 0xff31 #define BAC2R 0xff32 #define VCF2R 0xff33 #define NCV2R 0xff34 #define CV2R 0xff35 #define FSV3R 0xff38 #define CHL3R 0xff39 #define BAC3R 0xff3a #define VCF3R 0xff3b #define NCV3R 0xff3c #define CV3R 0xff3d #define YIELDS 0xff80 #define ANY 0xffff /* Meanings of the character flags: 01 freestanding vowel or consonant-vowel or cluster-vowel combination. 02 consonant half-letter. 04 bare adandic consonant, which must be viramified. 08 vowel combining form or virama. 10 non-consonant-vowel, before which a bare consonant (even if dandic) must be viramified. 20 consonant-vowel */ #define FSV 1 #define CHL 2 #define BAC 4 #define VCF 8 #define NCV 16 #define CV 32 unsigned char charflags[]={ NCV, // null character ends string, so must viramify NCV,NCV,NCV, // candrabindu, anusvara,visarga NCV, // unassigned FSV,FSV,FSV,FSV, // A AA I II FSV,FSV,FSV,FSV, // U UU RI LI FSV,FSV,FSV,FSV, // ? ? E AI FSV,FSV,FSV,FSV, // O ? ? 
AU CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // KA KHA GA GHA NGA CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // CA CHA JA JHA NYA CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // TTA TTHA DDA DDHA NNA CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // TA THA DA DHA NA CV+FSV, // NNNA CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // PA PHA BA BHA MA CV+FSV,CV+FSV,CV+FSV,CV+FSV,CV+FSV, // YA RA RRA LA LLA CV+FSV,CV+FSV, // LLLA WA CV+FSV,CV+FSV,CV+FSV,CV+FSV, // SHA SSA SA HA NCV,NCV,NCV,NCV, // ? ? NUKTA AVAGRAHA VCF,VCF,VCF,VCF, // AA I II U VCF,VCF,VCF,VCF, // UU RI RRI ? VCF,VCF,VCF,VCF, // ? E AI ? VCF,VCF,VCF,VCF, // ? O AU VIRAMA NCV,NCV,NCV, // unassigned unassigned OM NCV,NCV,NCV,NCV, // accents NCV,NCV,NCV, // unassigned CV+FSV,CV+FSV,CV+FSV,CV+FSV, // KA KHA GA JA with dot CV+FSV,CV+FSV,CV+FSV,CV+FSV, // DDA DDHA PHA PA with dot FSV,FSV,VCF,VCF, // RRI LLI LI LLI NCV,NCV,NCV,NCV, // DANDA DVIDANDA 0 1 NCV,NCV,NCV,NCV, // 2 3 4 5 NCV,NCV,NCV,NCV, // 6 7 8 9 NCV,NCV,NCV,NCV, // abbreviation 3*unassigned NCV,NCV,NCV,NCV, // unassigned (in Bengali assigned to currency symbols) NCV,NCV,NCV,NCV, // unassigned NCV,NCV,NCV,NCV, // unassigned. Here ends the Unicode page. 
NCV,CV+FSV,CV+FSV,CV+FSV, // blank and special CV ligatures CV+FSV,CV+FSV,CV+FSV,CV+FSV, CV+FSV,CV+FSV,CV+FSV,CV+FSV, CV+FSV,CV+FSV,CV+FSV,CV+FSV, NCV,NCV,NCV,NCV,NCV, CHL,CHL,CHL,CHL,CHL+BAC, // K KH G GH NG CHL,CHL+BAC,CHL,CHL,CHL, // C CH J JH NY CHL+BAC,CHL+BAC,CHL+BAC,CHL+BAC,CHL, // TT TTH DD DDH NN CHL,CHL,CHL+BAC,CHL,CHL, // T TH D DH N CHL, // NNN CHL,CHL+BAC,CHL,CHL,CHL, // P PH B BH M CHL,CHL+BAC,CHL+BAC,CHL,CHL, // Y R RR L LL CHL,CHL, // LLL W CHL,CHL,CHL,CHL+BAC, // SH SS S H 0}; NAGCHAR rules[]={ CHL0,A,YIELDS,CHL0R,0, CHL0,AA,YIELDS,CHL0R,VAA,0, CHL0,I,YIELDS,CHL0R,VI,0, CHL0,II,YIELDS,CHL0R,VII,0, CHL0,U,YIELDS,CHL0R,VU,0, CHL0,UU,YIELDS,CHL0R,VUU,0, CHL0,RI,YIELDS,CHL0R,VR,0, CHL0,RRI,YIELDS,CHL0R,VRR,0, CHL0,E,YIELDS,CHL0R,VE,0, CHL0,AI,YIELDS,CHL0R,VAI,0, CHL0,O,YIELDS,CHL0R,VO,0, CHL0,AU,YIELDS,CHL0R,VAU,0, CHL0,NCV0,YIELDS,CHL0R,VIRAMA,NCV0,0, ANY,YIELDS+1,ANY,0, // this rule MUST be last; it advances the pointer 0}; NAGCHAR reg[64],any; int match1(NAGCHAR ch,NAGCHAR pat) /* Returns true if character ch matches pattern pat. If pat is a wildcard, the corresponding register in reg is set. */ {if (pat==ANY) {any=ch; return 1; } if (pat>=FSV0 && pat=YIELDS-16 && (x)=FSV0 && (x)