diff options
author | John Ankarström <john@ankarstrom.se> | 2021-01-28 21:33:30 +0000 |
---|---|---|
committer | John Ankarström <john@ankarstrom.se> | 2021-01-28 21:33:30 +0000 |
commit | 51b5b02f52cc6f314029f3f2fe97afdc26ba0f25 (patch) | |
tree | 711d1efadeb78ad9b2d9f7b629350358838bc626 /patch/troff-hyphenate-latin-1-fixed | |
parent | f3fd330cddade1c66d0f101d5cc6f657c4cd1bb6 (diff) | |
download | plan9-51b5b02f52cc6f314029f3f2fe97afdc26ba0f25.tar.gz |
Add various patches
Diffstat (limited to 'patch/troff-hyphenate-latin-1-fixed')
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/email | 1 | ||||
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/files | 1 | ||||
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/n8.c | 584 | ||||
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/n8.c.orig | 545 | ||||
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/notes | 0 | ||||
-rw-r--r-- | patch/troff-hyphenate-latin-1-fixed/readme | 9 |
6 files changed, 1140 insertions, 0 deletions
diff --git a/patch/troff-hyphenate-latin-1-fixed/email b/patch/troff-hyphenate-latin-1-fixed/email new file mode 100644 index 0000000..191feb6 --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/email @@ -0,0 +1 @@ +john@ankarstrom.se diff --git a/patch/troff-hyphenate-latin-1-fixed/files b/patch/troff-hyphenate-latin-1-fixed/files new file mode 100644 index 0000000..48b7a8b --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/files @@ -0,0 +1 @@ +/sys/src/cmd/troff/n8.c n8.c diff --git a/patch/troff-hyphenate-latin-1-fixed/n8.c b/patch/troff-hyphenate-latin-1-fixed/n8.c new file mode 100644 index 0000000..b503fdf --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/n8.c @@ -0,0 +1,584 @@ +#include "tdef.h" +#include "fns.h" +#include "ext.h" +#include <assert.h> + +#define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */ + /* this value is used (as a literal) in suftab.c */ + /* to encode possible hyphenation points in suffixes. */ + /* it could be changed, by widening the tables */ + /* to be shorts instead of chars. */ + +/* + * troff8.c + * + * hyphenation + */ + +int hexsize = 0; /* hyphenation exception list size */ +char *hbufp = NULL; /* base of list */ +char *nexth = NULL; /* first free slot in list */ +Tchar *hyend; + +#define LATIN 256 +#define latcbits(i) ((i)+1 & 0x000FF) /* for some reason, extra chars */ + /* from latin-1 are off by one */ + +#define THRESH 160 /* digram goodness threshold */ +int thresh = THRESH; + +int texhyphen(void); +static int alpha(Tchar); + +void hyphen(Tchar *wp) +{ + int j; + Tchar *i; + + i = wp; + while (punct((*i++))) + ; + if (!alpha(*--i)) + return; + wdstart = i++; + while (alpha(*i++)) + ; + hyend = wdend = --i - 1; + while (punct((*i++))) + ; + if (*--i) + return; + if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */ + return; + hyp = hyptr; + *hyp = 0; + hyoff = 2; + + /* for now, try exceptions first, then tex (if hyphalg is non-zero), + then suffix and digram if tex didn't hyphenate it at all. + */ + + //if (!exword() && !texhyphen() && !suffix()) + if (!exword() && !texhyphen()) + digram(); + + /* this appears to sort hyphenation points into increasing order */ + *hyp++ = 0; + if (*hyptr) + for (j = 1; j; ) { + j = 0; + for (hyp = hyptr + 1; *hyp != 0; hyp++) { + if (*(hyp - 1) > *hyp) { + j++; + i = *hyp; + *hyp = *(hyp - 1); + *(hyp - 1) = i; + } + } + } +} + +static alpha(Tchar i) /* non-zero if really alphabetic */ +{ + if (ismot(i)) + return 0; + else if (latcbits(i) > LATIN) /* this isn't very elegant, but there's */ + return 0; /* no good way to make sure i is in range for */ + else if (latcbits(i) >= LATIN-64) { /* the call of isalpha */ + return (latcbits(i) != 0xD7 && latcbits(i) != 0xF7); + } else + return isalpha(cbits(i)); +} + + +punct(Tchar i) +{ + if (!i || alpha(i)) + return(0); + else + return(1); +} + + +void caseha(void) /* set hyphenation algorithm */ +{ + hyphalg = HYPHALG; + if (skip()) + return; + noscale++; + hyphalg = atoi0(); + noscale = 0; +} + + +void caseht(void) /* set hyphenation threshold; not in manual! */ +{ + thresh = THRESH; + if (skip()) + return; + noscale++; + thresh = atoi0(); + noscale = 0; +} + + +char *growh(char *where) +{ + char *new; + + hexsize += NHEX; + if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL) + return NULL; + if (new == hbufp) { + return where; + } else { + int diff; + diff = where - hbufp; + hbufp = new; + return new + diff; + } +} + + +void casehw(void) +{ + int i, k; + char *j; + Tchar t; + + if (nexth == NULL) { + if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) { + ERROR "No space for exception word list." WARN; + return; + } + hexsize = NHEX; + } + k = 0; + while (!skip()) { + if ((j = nexth) >= hbufp + hexsize - 2) + if ((j = nexth = growh(j)) == NULL) + goto full; + for (;;) { + if (ismot(t = getch())) + continue; + i = cbits(t); + if (i == ' ' || i == '\n') { + *j++ = 0; + nexth = j; + *j = 0; + if (i == ' ') + break; + else + return; + } + if (i == '-') { + k = HY_BIT; + continue; + } + *j++ = maplow(i) | k; + k = 0; + if (j >= hbufp + hexsize - 2) + if ((j = growh(j)) == NULL) + goto full; + } + } + return; +full: + ERROR "Cannot grow exception word list." WARN; + *nexth = 0; +} + + +int exword(void) +{ + Tchar *w; + char *e, *save; + + e = hbufp; + while (1) { + save = e; + if (e == NULL || *e == 0) + return(0); + w = wdstart; + while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) { + e++; + w++; + } + if (!*e) { + if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) { + w = wdstart; + for (e = save; *e; e++) { + if (*e & HY_BIT) + *hyp++ = w; + if (hyp > hyptr + NHYP - 1) + hyp = hyptr + NHYP - 1; + w++; + } + return(1); + } else { + e++; + continue; + } + } else + while (*e++) + ; + } +} + + +suffix(void) +{ + Tchar *w; + char *s, *s0; + Tchar i; + extern char *suftab[]; + +again: + i = cbits(*hyend); + if (!alpha(i)) + return(0); + if (i < 'a') + i -= 'A' - 'a'; + if ((s0 = suftab[i-'a']) == 0) + return(0); + for (;;) { + if ((i = *s0 & 017) == 0) + return(0); + s = s0 + i - 1; + w = hyend - 1; + while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) { + s--; + w--; + } + if (s == s0) + break; + s0 += i; + } + s = s0 + i - 1; + w = hyend; + if (*s0 & HY_BIT) + goto mark; + while (s > s0) { + w--; + if (*s-- & HY_BIT) { +mark: + hyend = w - 1; + if (*s0 & 0100) /* 0100 used in suftab to encode something too */ + continue; + if (!chkvow(w)) + return(0); + *hyp++ = w; + } + } + if (*s0 & 040) + return(0); + if (exword()) + return(1); + goto again; +} + + +maplow(int i) +{ + if (isupper(i)) + i = tolower(i); + else if ((latcbits(i) >= 0xC0 && latcbits(i) <= 0xD6) || (latcbits(i) >= 0xD8 && latcbits(i) <= 0xDD)) + i = tolower(latcbits(i)); + return(i); +} + + +vowel(int i) +{ + int j = latcbits(i); + if (j >= 0xC0 && j <= 0xC6 || /* uppercase */ + j >= 0xC8 && j <= 0xCF || + j >= 0xD2 && j <= 0xD6 || + j >= 0xD8 && j <= 0xDD || + j >= 0xE0 && j <= 0xE6 || /* lowercase */ + j >= 0xE8 && j <= 0xEF || + j >= 0xF2 && j <= 0xF6 || + j >= 0xF8 && j <= 0xFD || + j == 0xFF) + return 1; + + switch (i) { + case 'a': case 'A': + case 'e': case 'E': + case 'i': case 'I': + case 'o': case 'O': + case 'u': case 'U': + case 'y': case 'Y': + return(1); + default: + return(0); + } +} + + +Tchar *chkvow(Tchar *w) +{ + while (--w >= wdstart) + if (vowel(cbits(*w))) + return(w); + return(0); +} + + +void digram(void) +{ + Tchar *w; + int val; + Tchar *nhyend, *maxw; + int maxval; + extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13]; + +again: + if (!(w = chkvow(hyend + 1))) + return; + hyend = w; + if (!(w = chkvow(hyend))) + return; + nhyend = w; + maxval = 0; + w--; + while (++w < hyend && w < wdend - 1) { + val = 1; + if (w == wdstart) + val *= dilook('a', cbits(*w), bxh); + else if (w == wdstart + 1) + val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); + else + val *= dilook(cbits(*(w-1)), cbits(*w), xxh); + val *= dilook(cbits(*w), cbits(*(w+1)), xhx); + val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); + if (val > maxval) { + maxval = val; + maxw = w + 1; + } + } + hyend = nhyend; + if (maxval > thresh) + *hyp++ = maxw; + goto again; +} + + +dilook(int a, int b, char t[26][13]) +{ + int i, j; + + i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; + if (!(j & 01)) + i >>= 4; + return(i & 017); +} + + +/* here beginneth the tex hyphenation code, as interpreted freely */ +/* the main difference is that there is no attempt to squeeze space */ +/* as tightly at tex does. */ + +static int texit(Tchar *, Tchar *); +static int readpats(void); +static void install(char *); +static void fixup(void); +static int trieindex(int, int); +static int trieindexpart(int); + +#define PARTMAX (27+31) /* latin-1-specific sizes */ +#define TRIEMAX (PARTMAX*PARTMAX+PARTMAX) + +static char pats[50000]; /* size ought to be computed dynamically */ +static char *nextpat = pats; +static char *trie[TRIEMAX]; + +int texhyphen(void) +{ + static int loaded = 0; /* -1: couldn't find tex file */ + + if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */ + return 0; + if (loaded == 0) { + if (readpats()) + loaded = 1; + else + loaded = -1; + } + return texit(wdstart, wdend); +} + +static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */ +{ + int nw, i, k, equal, cnt[500]; + char w[500+1], *np, *pp, *wp, *xpp, *xwp; + + w[0] = '.'; + for (nw = 1; start <= end && nw < 500-1; nw++, start++) + w[nw] = maplow(cbits(*start)); + start -= (nw - 1); + w[nw++] = '.'; + w[nw] = 0; +/* + * printf("try %s\n", w); +*/ + for (i = 0; i <= nw; i++) + cnt[i] = '0'; + + for (wp = w; wp+1 < w+nw; wp++) { + for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) { + if (pp == 0 /* no trie entry */ + || *pp != *wp /* no match on 1st letter */ + || *(pp+1) != *(wp+1)) /* no match on 2nd letter */ + break; /* so move to next letter of word */ + equal = 1; + for (xpp = pp+2, xwp = wp+2; *xpp; ) + if (*xpp++ != *xwp++) { + equal = 0; + break; + } + if (equal) { + np = xpp+1; /* numpat */ + for (k = wp-w; *np; k++, np++) + if (*np > cnt[k]) + cnt[k] = *np; +/* + * printf("match: %s %s\n", pp, xpp+1); +*/ + } + pp += *(pp-1); /* skip over pattern and numbers to next */ + } + } +/* + * for (i = 0; i < nw; i++) printf("%c", w[i]); + * printf(" "); + * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); + * printf("\n"); +*/ +/* + * for (i = 1; i < nw - 1; i++) { + * if (i > 2 && i < nw - 3 && cnt[i] % 2) + * printf("-"); + * if (cbits(start[i-1]) != '.') + * printf("%c", cbits(start[i-1])); + * } + * printf("\n"); +*/ + for (i = 1; i < nw -1; i++) + if (i > 2 && i < nw - 3 && cnt[i] % 2) + *hyp++ = start + i - 1; + return hyp - hyptr; /* non-zero if a hyphen was found */ +} + +/* + This code assumes that hyphen.tex looks like + % some comments + \patterns{ % more comments + pat5ter4ns, 1 per line, SORTED, nothing else + } + more goo + \hyphenation{ % more comments + ex-cep-tions, one per line; i ignore this part for now + } + + this code is NOT robust against variations. unfortunately, + it looks like every local language version of this file has + a different format. i have also made no provision for weird + characters. sigh. +*/ + +static int readpats(void) +{ + FILE *fp; + char buf[200], buf1[200]; + + if ((fp = fopen(TEXHYPHENS, "r")) == NULL + && (fp = fopen(DWBalthyphens, "r")) == NULL) { + ERROR "warning: can't find hyphen.tex" WARN; + return 0; + } + + while (fgets(buf, sizeof buf, fp) != NULL) { + sscanf(buf, "%s", buf1); + if (strcmp(buf1, "\\patterns{") == 0) + break; + } + while (fgets(buf, sizeof buf, fp) != NULL) { + if (buf[0] == '}') + break; + install(buf); + } + fclose(fp); + fixup(); + return 1; +} + +static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */ +{ + int npat, lastpat; + char num[500], *onextpat = nextpat; + + num[0] = '0'; + *nextpat++ = ' '; /* fill in with count later */ + for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { + if (isdigit(*s)) { + num[npat] = *s; + lastpat = npat; + } else { + *nextpat++ = *s; + npat++; + num[npat] = '0'; + } + } + *nextpat++ = 0; + if (nextpat > pats + sizeof(pats)-20) { + ERROR "tex hyphenation table overflow, tail end ignored" WARN; + nextpat = onextpat; + } + num[lastpat+1] = 0; + strcat(nextpat, num); + nextpat += strlen(nextpat) + 1; +} + +static void fixup(void) /* build indexes of where . a b c ... start */ +{ + char *p, *lastc; + int n; + + for (lastc = pats, p = pats+1; p < nextpat; p++) + if (*p == ' ') { + *lastc = p - lastc; + lastc = p; + } + *lastc = p - lastc; + for (p = pats+1; p < nextpat; ) { + n = trieindex(p[0], p[1]); + if (trie[n] == 0) + trie[n] = p; + p += p[-1]; + } + /* printf("pats = %d\n", nextpat - pats); */ +} + +static int trieindex(int d1, int d2) +{ + int i; + + i = PARTMAX*trieindexpart(d1) + trieindexpart(d2); + if (!(0 <= i && i < TRIEMAX)) { + fprintf(stderr, "i = %d\n", i); + fprintf(stderr, "d1 = %x = %d\n", d1, d1); + fprintf(stderr, "d2 = %x = %d\n", d2, d2); + fprintf(stderr, "part(d1) = %x = %d\n", trieindexpart(d1), trieindexpart(d1)); + fprintf(stderr, "part(d2) = %x = %d\n", trieindexpart(d2), trieindexpart(d2)); + assert(0); + } + return i; +} + +static int trieindexpart(int d) +{ + if (cbits(d) == '.') return 0; + if (cbits(d) <= 'z') return d - 'a' + 1; + else return latcbits(d) - 0xE0 + 27; /* L'à' comes after 'z' */ +} diff --git a/patch/troff-hyphenate-latin-1-fixed/n8.c.orig b/patch/troff-hyphenate-latin-1-fixed/n8.c.orig new file mode 100644 index 0000000..8c2112c --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/n8.c.orig @@ -0,0 +1,545 @@ +#include "tdef.h" +#include "fns.h" +#include "ext.h" +#include <assert.h> + +#define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */ + /* this value is used (as a literal) in suftab.c */ + /* to encode possible hyphenation points in suffixes. */ + /* it could be changed, by widening the tables */ + /* to be shorts instead of chars. */ + +/* + * troff8.c + * + * hyphenation + */ + +int hexsize = 0; /* hyphenation exception list size */ +char *hbufp = NULL; /* base of list */ +char *nexth = NULL; /* first free slot in list */ +Tchar *hyend; + +#define THRESH 160 /* digram goodness threshold */ +int thresh = THRESH; + +int texhyphen(void); +static int alpha(Tchar); + +void hyphen(Tchar *wp) +{ + int j; + Tchar *i; + + i = wp; + while (punct((*i++))) + ; + if (!alpha(*--i)) + return; + wdstart = i++; + while (alpha(*i++)) + ; + hyend = wdend = --i - 1; + while (punct((*i++))) + ; + if (*--i) + return; + if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */ + return; + hyp = hyptr; + *hyp = 0; + hyoff = 2; + + /* for now, try exceptions first, then tex (if hyphalg is non-zero), + then suffix and digram if tex didn't hyphenate it at all. + */ + + if (!exword() && !texhyphen() && !suffix()) + digram(); + + /* this appears to sort hyphenation points into increasing order */ + *hyp++ = 0; + if (*hyptr) + for (j = 1; j; ) { + j = 0; + for (hyp = hyptr + 1; *hyp != 0; hyp++) { + if (*(hyp - 1) > *hyp) { + j++; + i = *hyp; + *hyp = *(hyp - 1); + *(hyp - 1) = i; + } + } + } +} + +static alpha(Tchar i) /* non-zero if really alphabetic */ +{ + if (ismot(i)) + return 0; + else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */ + return 0; /* no good way to make sure i is in range for */ + else /* the call of isalpha */ + return isalpha(cbits(i)); +} + + +punct(Tchar i) +{ + if (!i || alpha(i)) + return(0); + else + return(1); +} + + +void caseha(void) /* set hyphenation algorithm */ +{ + hyphalg = HYPHALG; + if (skip()) + return; + noscale++; + hyphalg = atoi0(); + noscale = 0; +} + + +void caseht(void) /* set hyphenation threshold; not in manual! */ +{ + thresh = THRESH; + if (skip()) + return; + noscale++; + thresh = atoi0(); + noscale = 0; +} + + +char *growh(char *where) +{ + char *new; + + hexsize += NHEX; + if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL) + return NULL; + if (new == hbufp) { + return where; + } else { + int diff; + diff = where - hbufp; + hbufp = new; + return new + diff; + } +} + + +void casehw(void) +{ + int i, k; + char *j; + Tchar t; + + if (nexth == NULL) { + if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) { + ERROR "No space for exception word list." WARN; + return; + } + hexsize = NHEX; + } + k = 0; + while (!skip()) { + if ((j = nexth) >= hbufp + hexsize - 2) + if ((j = nexth = growh(j)) == NULL) + goto full; + for (;;) { + if (ismot(t = getch())) + continue; + i = cbits(t); + if (i == ' ' || i == '\n') { + *j++ = 0; + nexth = j; + *j = 0; + if (i == ' ') + break; + else + return; + } + if (i == '-') { + k = HY_BIT; + continue; + } + *j++ = maplow(i) | k; + k = 0; + if (j >= hbufp + hexsize - 2) + if ((j = growh(j)) == NULL) + goto full; + } + } + return; +full: + ERROR "Cannot grow exception word list." WARN; + *nexth = 0; +} + + +int exword(void) +{ + Tchar *w; + char *e, *save; + + e = hbufp; + while (1) { + save = e; + if (e == NULL || *e == 0) + return(0); + w = wdstart; + while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) { + e++; + w++; + } + if (!*e) { + if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) { + w = wdstart; + for (e = save; *e; e++) { + if (*e & HY_BIT) + *hyp++ = w; + if (hyp > hyptr + NHYP - 1) + hyp = hyptr + NHYP - 1; + w++; + } + return(1); + } else { + e++; + continue; + } + } else + while (*e++) + ; + } +} + + +suffix(void) +{ + Tchar *w; + char *s, *s0; + Tchar i; + extern char *suftab[]; + +again: + i = cbits(*hyend); + if (!alpha(i)) + return(0); + if (i < 'a') + i -= 'A' - 'a'; + if ((s0 = suftab[i-'a']) == 0) + return(0); + for (;;) { + if ((i = *s0 & 017) == 0) + return(0); + s = s0 + i - 1; + w = hyend - 1; + while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) { + s--; + w--; + } + if (s == s0) + break; + s0 += i; + } + s = s0 + i - 1; + w = hyend; + if (*s0 & HY_BIT) + goto mark; + while (s > s0) { + w--; + if (*s-- & HY_BIT) { +mark: + hyend = w - 1; + if (*s0 & 0100) /* 0100 used in suftab to encode something too */ + continue; + if (!chkvow(w)) + return(0); + *hyp++ = w; + } + } + if (*s0 & 040) + return(0); + if (exword()) + return(1); + goto again; +} + + +maplow(int i) +{ + if (isupper(i)) + i = tolower(i); + return(i); +} + + +vowel(int i) +{ + switch (i) { + case 'a': case 'A': + case 'e': case 'E': + case 'i': case 'I': + case 'o': case 'O': + case 'u': case 'U': + case 'y': case 'Y': + return(1); + default: + return(0); + } +} + + +Tchar *chkvow(Tchar *w) +{ + while (--w >= wdstart) + if (vowel(cbits(*w))) + return(w); + return(0); +} + + +void digram(void) +{ + Tchar *w; + int val; + Tchar *nhyend, *maxw; + int maxval; + extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13]; + +again: + if (!(w = chkvow(hyend + 1))) + return; + hyend = w; + if (!(w = chkvow(hyend))) + return; + nhyend = w; + maxval = 0; + w--; + while (++w < hyend && w < wdend - 1) { + val = 1; + if (w == wdstart) + val *= dilook('a', cbits(*w), bxh); + else if (w == wdstart + 1) + val *= dilook(cbits(*(w-1)), cbits(*w), bxxh); + else + val *= dilook(cbits(*(w-1)), cbits(*w), xxh); + val *= dilook(cbits(*w), cbits(*(w+1)), xhx); + val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx); + if (val > maxval) { + maxval = val; + maxw = w + 1; + } + } + hyend = nhyend; + if (maxval > thresh) + *hyp++ = maxw; + goto again; +} + + +dilook(int a, int b, char t[26][13]) +{ + int i, j; + + i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2]; + if (!(j & 01)) + i >>= 4; + return(i & 017); +} + + +/* here beginneth the tex hyphenation code, as interpreted freely */ +/* the main difference is that there is no attempt to squeeze space */ +/* as tightly at tex does. */ + +static int texit(Tchar *, Tchar *); +static int readpats(void); +static void install(char *); +static void fixup(void); +static int trieindex(int, int); + +static char pats[50000]; /* size ought to be computed dynamically */ +static char *nextpat = pats; +static char *trie[27*27]; /* english-specific sizes */ + +int texhyphen(void) +{ + static int loaded = 0; /* -1: couldn't find tex file */ + + if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */ + return 0; + if (loaded == 0) { + if (readpats()) + loaded = 1; + else + loaded = -1; + } + return texit(wdstart, wdend); +} + +static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */ +{ + int nw, i, k, equal, cnt[500]; + char w[500+1], *np, *pp, *wp, *xpp, *xwp; + + w[0] = '.'; + for (nw = 1; start <= end && nw < 500-1; nw++, start++) + w[nw] = maplow(tolower(cbits(*start))); + start -= (nw - 1); + w[nw++] = '.'; + w[nw] = 0; +/* + * printf("try %s\n", w); +*/ + for (i = 0; i <= nw; i++) + cnt[i] = '0'; + + for (wp = w; wp+1 < w+nw; wp++) { + for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) { + if (pp == 0 /* no trie entry */ + || *pp != *wp /* no match on 1st letter */ + || *(pp+1) != *(wp+1)) /* no match on 2nd letter */ + break; /* so move to next letter of word */ + equal = 1; + for (xpp = pp+2, xwp = wp+2; *xpp; ) + if (*xpp++ != *xwp++) { + equal = 0; + break; + } + if (equal) { + np = xpp+1; /* numpat */ + for (k = wp-w; *np; k++, np++) + if (*np > cnt[k]) + cnt[k] = *np; +/* + * printf("match: %s %s\n", pp, xpp+1); +*/ + } + pp += *(pp-1); /* skip over pattern and numbers to next */ + } + } +/* + * for (i = 0; i < nw; i++) printf("%c", w[i]); + * printf(" "); + * for (i = 0; i <= nw; i++) printf("%c", cnt[i]); + * printf("\n"); +*/ +/* + * for (i = 1; i < nw - 1; i++) { + * if (i > 2 && i < nw - 3 && cnt[i] % 2) + * printf("-"); + * if (cbits(start[i-1]) != '.') + * printf("%c", cbits(start[i-1])); + * } + * printf("\n"); +*/ + for (i = 1; i < nw -1; i++) + if (i > 2 && i < nw - 3 && cnt[i] % 2) + *hyp++ = start + i - 1; + return hyp - hyptr; /* non-zero if a hyphen was found */ +} + +/* + This code assumes that hyphen.tex looks like + % some comments + \patterns{ % more comments + pat5ter4ns, 1 per line, SORTED, nothing else + } + more goo + \hyphenation{ % more comments + ex-cep-tions, one per line; i ignore this part for now + } + + this code is NOT robust against variations. unfortunately, + it looks like every local language version of this file has + a different format. i have also made no provision for weird + characters. sigh. +*/ + +static int readpats(void) +{ + FILE *fp; + char buf[200], buf1[200]; + + if ((fp = fopen(TEXHYPHENS, "r")) == NULL + && (fp = fopen(DWBalthyphens, "r")) == NULL) { + ERROR "warning: can't find hyphen.tex" WARN; + return 0; + } + + while (fgets(buf, sizeof buf, fp) != NULL) { + sscanf(buf, "%s", buf1); + if (strcmp(buf1, "\\patterns{") == 0) + break; + } + while (fgets(buf, sizeof buf, fp) != NULL) { + if (buf[0] == '}') + break; + install(buf); + } + fclose(fp); + fixup(); + return 1; +} + +static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */ +{ + int npat, lastpat; + char num[500], *onextpat = nextpat; + + num[0] = '0'; + *nextpat++ = ' '; /* fill in with count later */ + for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) { + if (isdigit(*s)) { + num[npat] = *s; + lastpat = npat; + } else { + *nextpat++ = *s; + npat++; + num[npat] = '0'; + } + } + *nextpat++ = 0; + if (nextpat > pats + sizeof(pats)-20) { + ERROR "tex hyphenation table overflow, tail end ignored" WARN; + nextpat = onextpat; + } + num[lastpat+1] = 0; + strcat(nextpat, num); + nextpat += strlen(nextpat) + 1; +} + +static void fixup(void) /* build indexes of where . a b c ... start */ +{ + char *p, *lastc; + int n; + + for (lastc = pats, p = pats+1; p < nextpat; p++) + if (*p == ' ') { + *lastc = p - lastc; + lastc = p; + } + *lastc = p - lastc; + for (p = pats+1; p < nextpat; ) { + n = trieindex(p[0], p[1]); + if (trie[n] == 0) + trie[n] = p; + p += p[-1]; + } + /* printf("pats = %d\n", nextpat - pats); */ +} + +static int trieindex(int d1, int d2) +{ + int i; + + i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1); + assert(0 <= i && i < 27*27); + return i; +} diff --git a/patch/troff-hyphenate-latin-1-fixed/notes b/patch/troff-hyphenate-latin-1-fixed/notes new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/notes diff --git a/patch/troff-hyphenate-latin-1-fixed/readme b/patch/troff-hyphenate-latin-1-fixed/readme new file mode 100644 index 0000000..0d3bd32 --- /dev/null +++ b/patch/troff-hyphenate-latin-1-fixed/readme @@ -0,0 +1,9 @@ +Add support for Latin-1 characters in hyphenation algorithm + +Ideally, it would support UTF-8, but Latin-1 is a big improvement +over ASCII nonetheless. + +This patch disables the suffix function, which I haven't (yet) gotten +to work with Latin-1. Suffix is, however, hard-coded for English, +and, in any case, I'm not sure how important it is. In the meantime, +it isn't a huge loss. |