summaryrefslogtreecommitdiff
path: root/patch/troff-hyphenate-latin-1-fixed/n8.c
diff options
context:
space:
mode:
Diffstat (limited to 'patch/troff-hyphenate-latin-1-fixed/n8.c')
-rw-r--r--patch/troff-hyphenate-latin-1-fixed/n8.c584
1 files changed, 584 insertions, 0 deletions
diff --git a/patch/troff-hyphenate-latin-1-fixed/n8.c b/patch/troff-hyphenate-latin-1-fixed/n8.c
new file mode 100644
index 0000000..b503fdf
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/n8.c
@@ -0,0 +1,584 @@
+#include "tdef.h"
+#include "fns.h"
+#include "ext.h"
+#include <assert.h>
+
+#define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
+ /* this value is used (as a literal) in suftab.c */
+ /* to encode possible hyphenation points in suffixes. */
+ /* it could be changed, by widening the tables */
+ /* to be shorts instead of chars. */
+
+/*
+ * troff8.c
+ *
+ * hyphenation
+ */
+
+int hexsize = 0; /* hyphenation exception list size */
+char *hbufp = NULL; /* base of list */
+char *nexth = NULL; /* first free slot in list */
+Tchar *hyend;
+
+#define LATIN 256
+#define latcbits(i) ((i)+1 & 0x000FF) /* for some reason, extra chars */
+ /* from latin-1 are off by one */
+
+#define THRESH 160 /* digram goodness threshold */
+int thresh = THRESH;
+
+int texhyphen(void);
+static int alpha(Tchar);
+
+void hyphen(Tchar *wp)
+{
+ int j;
+ Tchar *i;
+
+ i = wp;
+ while (punct((*i++)))
+ ;
+ if (!alpha(*--i))
+ return;
+ wdstart = i++;
+ while (alpha(*i++))
+ ;
+ hyend = wdend = --i - 1;
+ while (punct((*i++)))
+ ;
+ if (*--i)
+ return;
+ if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
+ return;
+ hyp = hyptr;
+ *hyp = 0;
+ hyoff = 2;
+
+ /* for now, try exceptions first, then tex (if hyphalg is non-zero),
+ then suffix and digram if tex didn't hyphenate it at all.
+ */
+
+ //if (!exword() && !texhyphen() && !suffix())
+ if (!exword() && !texhyphen())
+ digram();
+
+ /* this appears to sort hyphenation points into increasing order */
+ *hyp++ = 0;
+ if (*hyptr)
+ for (j = 1; j; ) {
+ j = 0;
+ for (hyp = hyptr + 1; *hyp != 0; hyp++) {
+ if (*(hyp - 1) > *hyp) {
+ j++;
+ i = *hyp;
+ *hyp = *(hyp - 1);
+ *(hyp - 1) = i;
+ }
+ }
+ }
+}
+
+static alpha(Tchar i) /* non-zero if really alphabetic */
+{
+ if (ismot(i))
+ return 0;
+ else if (latcbits(i) > LATIN) /* this isn't very elegant, but there's */
+ return 0; /* no good way to make sure i is in range for */
+ else if (latcbits(i) >= LATIN-64) { /* the call of isalpha */
+ return (latcbits(i) != 0xD7 && latcbits(i) != 0xF7);
+ } else
+ return isalpha(cbits(i));
+}
+
+
+punct(Tchar i)
+{
+ if (!i || alpha(i))
+ return(0);
+ else
+ return(1);
+}
+
+
+void caseha(void) /* set hyphenation algorithm */
+{
+ hyphalg = HYPHALG;
+ if (skip())
+ return;
+ noscale++;
+ hyphalg = atoi0();
+ noscale = 0;
+}
+
+
+void caseht(void) /* set hyphenation threshold; not in manual! */
+{
+ thresh = THRESH;
+ if (skip())
+ return;
+ noscale++;
+ thresh = atoi0();
+ noscale = 0;
+}
+
+
+char *growh(char *where)
+{
+ char *new;
+
+ hexsize += NHEX;
+ if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
+ return NULL;
+ if (new == hbufp) {
+ return where;
+ } else {
+ int diff;
+ diff = where - hbufp;
+ hbufp = new;
+ return new + diff;
+ }
+}
+
+
+void casehw(void)
+{
+ int i, k;
+ char *j;
+ Tchar t;
+
+ if (nexth == NULL) {
+ if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
+ ERROR "No space for exception word list." WARN;
+ return;
+ }
+ hexsize = NHEX;
+ }
+ k = 0;
+ while (!skip()) {
+ if ((j = nexth) >= hbufp + hexsize - 2)
+ if ((j = nexth = growh(j)) == NULL)
+ goto full;
+ for (;;) {
+ if (ismot(t = getch()))
+ continue;
+ i = cbits(t);
+ if (i == ' ' || i == '\n') {
+ *j++ = 0;
+ nexth = j;
+ *j = 0;
+ if (i == ' ')
+ break;
+ else
+ return;
+ }
+ if (i == '-') {
+ k = HY_BIT;
+ continue;
+ }
+ *j++ = maplow(i) | k;
+ k = 0;
+ if (j >= hbufp + hexsize - 2)
+ if ((j = growh(j)) == NULL)
+ goto full;
+ }
+ }
+ return;
+full:
+ ERROR "Cannot grow exception word list." WARN;
+ *nexth = 0;
+}
+
+
+int exword(void)
+{
+ Tchar *w;
+ char *e, *save;
+
+ e = hbufp;
+ while (1) {
+ save = e;
+ if (e == NULL || *e == 0)
+ return(0);
+ w = wdstart;
+ while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
+ e++;
+ w++;
+ }
+ if (!*e) {
+ if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
+ w = wdstart;
+ for (e = save; *e; e++) {
+ if (*e & HY_BIT)
+ *hyp++ = w;
+ if (hyp > hyptr + NHYP - 1)
+ hyp = hyptr + NHYP - 1;
+ w++;
+ }
+ return(1);
+ } else {
+ e++;
+ continue;
+ }
+ } else
+ while (*e++)
+ ;
+ }
+}
+
+
+suffix(void)
+{
+ Tchar *w;
+ char *s, *s0;
+ Tchar i;
+ extern char *suftab[];
+
+again:
+ i = cbits(*hyend);
+ if (!alpha(i))
+ return(0);
+ if (i < 'a')
+ i -= 'A' - 'a';
+ if ((s0 = suftab[i-'a']) == 0)
+ return(0);
+ for (;;) {
+ if ((i = *s0 & 017) == 0)
+ return(0);
+ s = s0 + i - 1;
+ w = hyend - 1;
+ while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
+ s--;
+ w--;
+ }
+ if (s == s0)
+ break;
+ s0 += i;
+ }
+ s = s0 + i - 1;
+ w = hyend;
+ if (*s0 & HY_BIT)
+ goto mark;
+ while (s > s0) {
+ w--;
+ if (*s-- & HY_BIT) {
+mark:
+ hyend = w - 1;
+ if (*s0 & 0100) /* 0100 used in suftab to encode something too */
+ continue;
+ if (!chkvow(w))
+ return(0);
+ *hyp++ = w;
+ }
+ }
+ if (*s0 & 040)
+ return(0);
+ if (exword())
+ return(1);
+ goto again;
+}
+
+
+maplow(int i)
+{
+ if (isupper(i))
+ i = tolower(i);
+ else if ((latcbits(i) >= 0xC0 && latcbits(i) <= 0xD6) || (latcbits(i) >= 0xD8 && latcbits(i) <= 0xDD))
+ i = tolower(latcbits(i));
+ return(i);
+}
+
+
+vowel(int i)
+{
+ int j = latcbits(i);
+ if (j >= 0xC0 && j <= 0xC6 || /* uppercase */
+ j >= 0xC8 && j <= 0xCF ||
+ j >= 0xD2 && j <= 0xD6 ||
+ j >= 0xD8 && j <= 0xDD ||
+ j >= 0xE0 && j <= 0xE6 || /* lowercase */
+ j >= 0xE8 && j <= 0xEF ||
+ j >= 0xF2 && j <= 0xF6 ||
+ j >= 0xF8 && j <= 0xFD ||
+ j == 0xFF)
+ return 1;
+
+ switch (i) {
+ case 'a': case 'A':
+ case 'e': case 'E':
+ case 'i': case 'I':
+ case 'o': case 'O':
+ case 'u': case 'U':
+ case 'y': case 'Y':
+ return(1);
+ default:
+ return(0);
+ }
+}
+
+
+Tchar *chkvow(Tchar *w)
+{
+ while (--w >= wdstart)
+ if (vowel(cbits(*w)))
+ return(w);
+ return(0);
+}
+
+
+void digram(void)
+{
+ Tchar *w;
+ int val;
+ Tchar *nhyend, *maxw;
+ int maxval;
+ extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
+
+again:
+ if (!(w = chkvow(hyend + 1)))
+ return;
+ hyend = w;
+ if (!(w = chkvow(hyend)))
+ return;
+ nhyend = w;
+ maxval = 0;
+ w--;
+ while (++w < hyend && w < wdend - 1) {
+ val = 1;
+ if (w == wdstart)
+ val *= dilook('a', cbits(*w), bxh);
+ else if (w == wdstart + 1)
+ val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
+ else
+ val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
+ val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
+ val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
+ if (val > maxval) {
+ maxval = val;
+ maxw = w + 1;
+ }
+ }
+ hyend = nhyend;
+ if (maxval > thresh)
+ *hyp++ = maxw;
+ goto again;
+}
+
+
+dilook(int a, int b, char t[26][13])
+{
+ int i, j;
+
+ i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
+ if (!(j & 01))
+ i >>= 4;
+ return(i & 017);
+}
+
+
+/* here beginneth the tex hyphenation code, as interpreted freely */
+/* the main difference is that there is no attempt to squeeze space */
+/* as tightly at tex does. */
+
+static int texit(Tchar *, Tchar *);
+static int readpats(void);
+static void install(char *);
+static void fixup(void);
+static int trieindex(int, int);
+static int trieindexpart(int);
+
+#define PARTMAX (27+31) /* latin-1-specific sizes */
+#define TRIEMAX (PARTMAX*PARTMAX+PARTMAX)
+
+static char pats[50000]; /* size ought to be computed dynamically */
+static char *nextpat = pats;
+static char *trie[TRIEMAX];
+
+int texhyphen(void)
+{
+ static int loaded = 0; /* -1: couldn't find tex file */
+
+ if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
+ return 0;
+ if (loaded == 0) {
+ if (readpats())
+ loaded = 1;
+ else
+ loaded = -1;
+ }
+ return texit(wdstart, wdend);
+}
+
+static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
+{
+ int nw, i, k, equal, cnt[500];
+ char w[500+1], *np, *pp, *wp, *xpp, *xwp;
+
+ w[0] = '.';
+ for (nw = 1; start <= end && nw < 500-1; nw++, start++)
+ w[nw] = maplow(cbits(*start));
+ start -= (nw - 1);
+ w[nw++] = '.';
+ w[nw] = 0;
+/*
+ * printf("try %s\n", w);
+*/
+ for (i = 0; i <= nw; i++)
+ cnt[i] = '0';
+
+ for (wp = w; wp+1 < w+nw; wp++) {
+ for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
+ if (pp == 0 /* no trie entry */
+ || *pp != *wp /* no match on 1st letter */
+ || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
+ break; /* so move to next letter of word */
+ equal = 1;
+ for (xpp = pp+2, xwp = wp+2; *xpp; )
+ if (*xpp++ != *xwp++) {
+ equal = 0;
+ break;
+ }
+ if (equal) {
+ np = xpp+1; /* numpat */
+ for (k = wp-w; *np; k++, np++)
+ if (*np > cnt[k])
+ cnt[k] = *np;
+/*
+ * printf("match: %s %s\n", pp, xpp+1);
+*/
+ }
+ pp += *(pp-1); /* skip over pattern and numbers to next */
+ }
+ }
+/*
+ * for (i = 0; i < nw; i++) printf("%c", w[i]);
+ * printf(" ");
+ * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
+ * printf("\n");
+*/
+/*
+ * for (i = 1; i < nw - 1; i++) {
+ * if (i > 2 && i < nw - 3 && cnt[i] % 2)
+ * printf("-");
+ * if (cbits(start[i-1]) != '.')
+ * printf("%c", cbits(start[i-1]));
+ * }
+ * printf("\n");
+*/
+ for (i = 1; i < nw -1; i++)
+ if (i > 2 && i < nw - 3 && cnt[i] % 2)
+ *hyp++ = start + i - 1;
+ return hyp - hyptr; /* non-zero if a hyphen was found */
+}
+
+/*
+ This code assumes that hyphen.tex looks like
+ % some comments
+ \patterns{ % more comments
+ pat5ter4ns, 1 per line, SORTED, nothing else
+ }
+ more goo
+ \hyphenation{ % more comments
+ ex-cep-tions, one per line; i ignore this part for now
+ }
+
+ this code is NOT robust against variations. unfortunately,
+ it looks like every local language version of this file has
+ a different format. i have also made no provision for weird
+ characters. sigh.
+*/
+
+static int readpats(void)
+{
+ FILE *fp;
+ char buf[200], buf1[200];
+
+ if ((fp = fopen(TEXHYPHENS, "r")) == NULL
+ && (fp = fopen(DWBalthyphens, "r")) == NULL) {
+ ERROR "warning: can't find hyphen.tex" WARN;
+ return 0;
+ }
+
+ while (fgets(buf, sizeof buf, fp) != NULL) {
+ sscanf(buf, "%s", buf1);
+ if (strcmp(buf1, "\\patterns{") == 0)
+ break;
+ }
+ while (fgets(buf, sizeof buf, fp) != NULL) {
+ if (buf[0] == '}')
+ break;
+ install(buf);
+ }
+ fclose(fp);
+ fixup();
+ return 1;
+}
+
+static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
+{
+ int npat, lastpat;
+ char num[500], *onextpat = nextpat;
+
+ num[0] = '0';
+ *nextpat++ = ' '; /* fill in with count later */
+ for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
+ if (isdigit(*s)) {
+ num[npat] = *s;
+ lastpat = npat;
+ } else {
+ *nextpat++ = *s;
+ npat++;
+ num[npat] = '0';
+ }
+ }
+ *nextpat++ = 0;
+ if (nextpat > pats + sizeof(pats)-20) {
+ ERROR "tex hyphenation table overflow, tail end ignored" WARN;
+ nextpat = onextpat;
+ }
+ num[lastpat+1] = 0;
+ strcat(nextpat, num);
+ nextpat += strlen(nextpat) + 1;
+}
+
+static void fixup(void) /* build indexes of where . a b c ... start */
+{
+ char *p, *lastc;
+ int n;
+
+ for (lastc = pats, p = pats+1; p < nextpat; p++)
+ if (*p == ' ') {
+ *lastc = p - lastc;
+ lastc = p;
+ }
+ *lastc = p - lastc;
+ for (p = pats+1; p < nextpat; ) {
+ n = trieindex(p[0], p[1]);
+ if (trie[n] == 0)
+ trie[n] = p;
+ p += p[-1];
+ }
+ /* printf("pats = %d\n", nextpat - pats); */
+}
+
+static int trieindex(int d1, int d2)
+{
+ int i;
+
+ i = PARTMAX*trieindexpart(d1) + trieindexpart(d2);
+ if (!(0 <= i && i < TRIEMAX)) {
+ fprintf(stderr, "i = %d\n", i);
+ fprintf(stderr, "d1 = %x = %d\n", d1, d1);
+ fprintf(stderr, "d2 = %x = %d\n", d2, d2);
+ fprintf(stderr, "part(d1) = %x = %d\n", trieindexpart(d1), trieindexpart(d1));
+ fprintf(stderr, "part(d2) = %x = %d\n", trieindexpart(d2), trieindexpart(d2));
+ assert(0);
+ }
+ return i;
+}
+
+static int trieindexpart(int d)
+{
+ if (cbits(d) == '.') return 0;
+ if (cbits(d) <= 'z') return d - 'a' + 1;
+ else return latcbits(d) - 0xE0 + 27; /* L'à' comes after 'z' */
+}