Add various patches

author: John Ankarström <john@ankarstrom.se> 2021-01-28 21:33:30 +0000
committer: John Ankarström <john@ankarstrom.se> 2021-01-28 21:33:30 +0000
commit: 51b5b02f52cc6f314029f3f2fe97afdc26ba0f25 (patch)
tree: 711d1efadeb78ad9b2d9f7b629350358838bc626 /patch/troff-hyphenate-latin-1-fixed
parent: f3fd330cddade1c66d0f101d5cc6f657c4cd1bb6 (diff)
download: plan9-51b5b02f52cc6f314029f3f2fe97afdc26ba0f25.tar.gz
6 files changed, 1140 insertions, 0 deletions
diff --git a/patch/troff-hyphenate-latin-1-fixed/email b/patch/troff-hyphenate-latin-1-fixed/email
new file mode 100644
index 0000000..191feb6
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/email
@@ -0,0 +1 @@
+john@ankarstrom.se
diff --git a/patch/troff-hyphenate-latin-1-fixed/files b/patch/troff-hyphenate-latin-1-fixed/files
new file mode 100644
index 0000000..48b7a8b
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/files
@@ -0,0 +1 @@
+/sys/src/cmd/troff/n8.c n8.c
diff --git a/patch/troff-hyphenate-latin-1-fixed/n8.c b/patch/troff-hyphenate-latin-1-fixed/n8.c
new file mode 100644
index 0000000..b503fdf
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/n8.c
@@ -0,0 +1,584 @@
+#include "tdef.h"
+#include "fns.h"
+#include "ext.h"
+#include <assert.h>
+
+#define	HY_BIT	0200	/* stuff in here only works for 7-bit ascii */
+			/* this value is used (as a literal) in suftab.c */
+			/* to encode possible hyphenation points in suffixes. */
+			/* it could be changed, by widening the tables */
+			/* to be shorts instead of chars. */
+
+/*
+ * troff8.c
+ *
+ * hyphenation
+ */
+
+int	hexsize = 0;		/* hyphenation exception list size */
+char	*hbufp = NULL;		/* base of list */
+char	*nexth = NULL;		/* first free slot in list */
+Tchar	*hyend;
+
+#define LATIN 256
+#define latcbits(i) ((i)+1 & 0x000FF)	/* for some reason, extra chars */
+					/* from latin-1 are off by one */
+
+#define THRESH 160 		/* digram goodness threshold */
+int	thresh = THRESH;
+
+int	texhyphen(void);
+static	int	alpha(Tchar);
+
+void hyphen(Tchar *wp)	/* collect hyphenation points for the word at wp into hyptr, in increasing order */
+{
+	int j;
+	Tchar *i;
+
+	i = wp;
+	while (punct((*i++)))	/* step over leading non-alphabetic characters */
+		;
+	if (!alpha(*--i))
+		return;
+	wdstart = i++;
+	while (alpha(*i++))
+		;
+	hyend = wdend = --i - 1;	/* last alphabetic character of the run */
+	while (punct((*i++)))	/* step over trailing non-alphabetic characters */
+		;
+	if (*--i)	/* something other than the 0 terminator follows: give up */
+		return;
+	if (wdend - wdstart < 4)	/* 4 chars is too short to hyphenate */
+		return;
+	hyp = hyptr;
+	*hyp = 0;
+	hyoff = 2;
+
+	/* for now, try exceptions first, then tex (if hyphalg is non-zero),
+	   then digram if neither of them hyphenated the word at all.
+	*/
+
+	/* suffix() is deliberately absent from the chain below */
+	if (!exword() && !texhyphen())
+		digram();
+
+	/* this appears to sort hyphenation points into increasing order */
+	*hyp++ = 0;
+	if (*hyptr)
+		for (j = 1; j; ) {	/* bubble sort: repeat until a pass makes no swap */
+			j = 0;
+			for (hyp = hyptr + 1; *hyp != 0; hyp++) {
+				if (*(hyp - 1) > *hyp) {
+					j++;
+					i = *hyp;
+					*hyp = *(hyp - 1);
+					*(hyp - 1) = i;
+				}
+			}
+		}
+}
+
+static int alpha(Tchar i)	/* non-zero if really alphabetic */
+{
+	if (ismot(i))
+		return 0;
+	else if (latcbits(i) >= LATIN-64)	/* shifted code in 0xC0-0xFF: Latin-1 letters ... */
+		return (latcbits(i) != 0xD7 && latcbits(i) != 0xF7);	/* ... except 0xD7 and 0xF7 */
+	else if (cbits(i) >= ALPHABET)		/* latcbits() is masked to 8 bits, so test cbits() itself */
+		return 0;			/* to keep i in range for the call of isalpha */
+	else
+		return isalpha(cbits(i));
+}
+
+
+punct(Tchar i)
+{
+	if (!i || alpha(i))
+		return(0);
+	else
+		return(1);
+}
+
+
+void caseha(void)	/* set hyphenation algorithm */
+{
+	hyphalg = HYPHALG;
+	if (skip())
+		return;
+	noscale++;
+	hyphalg = atoi0();
+	noscale = 0;
+}
+
+
+void caseht(void)	/* set hyphenation threshold;  not in manual! */
+{
+	thresh = THRESH;
+	if (skip())
+		return;
+	noscale++;
+	thresh = atoi0();
+	noscale = 0;
+}
+
+
+char *growh(char *where)
+{
+	char *new;
+
+	hexsize += NHEX;
+	if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
+		return NULL;
+	if (new == hbufp) {
+		return where;
+	} else {
+		int diff;
+		diff = where - hbufp;
+		hbufp = new;
+		return new + diff;
+	}
+}
+
+
+void casehw(void)
+{
+	int i, k;
+	char *j;
+	Tchar t;
+
+	if (nexth == NULL) {
+		if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
+			ERROR "No space for exception word list." WARN;
+			return;
+		}
+		hexsize = NHEX;
+	}
+	k = 0;
+	while (!skip()) {
+		if ((j = nexth) >= hbufp + hexsize - 2)
+			if ((j = nexth = growh(j)) == NULL)
+				goto full;
+		for (;;) {
+			if (ismot(t = getch()))
+				continue;
+			i = cbits(t);
+			if (i == ' ' || i == '\n') {
+				*j++ = 0;
+				nexth = j;
+				*j = 0;
+				if (i == ' ')
+					break;
+				else
+					return;
+			}
+			if (i == '-') {
+				k = HY_BIT;
+				continue;
+			}
+			*j++ = maplow(i) | k;
+			k = 0;
+			if (j >= hbufp + hexsize - 2)
+				if ((j = growh(j)) == NULL)
+					goto full;
+		}
+	}
+	return;
+full:
+	ERROR "Cannot grow exception word list." WARN;
+	*nexth = 0;
+}
+
+
+int exword(void)
+{
+	Tchar *w;
+	char *e, *save;
+
+	e = hbufp;
+	while (1) {
+		save = e;
+		if (e == NULL || *e == 0)
+			return(0);
+		w = wdstart;
+		while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
+			e++;
+			w++;
+		}
+		if (!*e) {
+			if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
+				w = wdstart;
+				for (e = save; *e; e++) {
+					if (*e & HY_BIT)
+						*hyp++ = w;
+					if (hyp > hyptr + NHYP - 1)
+						hyp = hyptr + NHYP - 1;
+					w++;
+				}
+				return(1);
+			} else {
+				e++;
+				continue;
+			}
+		} else
+			while (*e++)
+				;
+	}
+}
+
+
+suffix(void)
+{
+	Tchar *w;
+	char *s, *s0;
+	Tchar i;
+	extern char *suftab[];
+
+again:
+	i = cbits(*hyend);
+	if (!alpha(i))
+		return(0);
+	if (i < 'a')
+		i -= 'A' - 'a';
+	if ((s0 = suftab[i-'a']) == 0)
+		return(0);
+	for (;;) {
+		if ((i = *s0 & 017) == 0)
+			return(0);
+		s = s0 + i - 1;
+		w = hyend - 1;
+		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
+			s--;
+			w--;
+		}
+		if (s == s0)
+			break;
+		s0 += i;
+	}
+	s = s0 + i - 1;
+	w = hyend;
+	if (*s0 & HY_BIT)
+		goto mark;
+	while (s > s0) {
+		w--;
+		if (*s-- & HY_BIT) {
+mark:
+			hyend = w - 1;
+			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
+				continue;
+			if (!chkvow(w))
+				return(0);
+			*hyp++ = w;
+		}
+	}
+	if (*s0 & 040)
+		return(0);
+	if (exword())
+		return(1);
+	goto again;
+}
+
+
+maplow(int i)	/* lower-case mapping applied before characters are compared or stored */
+{
+	if (isupper(i))
+		i = tolower(i);
+	else if ((latcbits(i) >= 0xC0 && latcbits(i) <= 0xD6) || (latcbits(i) >= 0xD8 && latcbits(i) <= 0xDD))	/* shifted code in 0xC0-0xD6 or 0xD8-0xDD */
+		i = tolower(latcbits(i));	/* NOTE(review): in the "C" locale tolower() only changes A-Z, so this may just store latcbits(i) -- confirm */
+	return(i);	/* every other value comes back unchanged */
+}
+
+
+vowel(int i)
+{
+	int j = latcbits(i);
+	if (j >= 0xC0 && j <= 0xC6 || /* uppercase */
+	    j >= 0xC8 && j <= 0xCF ||
+	    j >= 0xD2 && j <= 0xD6 ||
+	    j >= 0xD8 && j <= 0xDD ||
+	    j >= 0xE0 && j <= 0xE6 || /* lowercase */
+	    j >= 0xE8 && j <= 0xEF ||
+	    j >= 0xF2 && j <= 0xF6 ||
+	    j >= 0xF8 && j <= 0xFD ||
+	    j == 0xFF)
+		return 1;
+
+	switch (i) {
+	case 'a': case 'A':
+	case 'e': case 'E':
+	case 'i': case 'I':
+	case 'o': case 'O':
+	case 'u': case 'U':
+	case 'y': case 'Y':
+		return(1);
+	default:
+		return(0);
+	}
+}
+
+
+Tchar *chkvow(Tchar *w)
+{
+	while (--w >= wdstart)
+		if (vowel(cbits(*w)))
+			return(w);
+	return(0);
+}
+
+
+void digram(void)
+{
+	Tchar *w;
+	int val;
+	Tchar *nhyend, *maxw;
+	int maxval;
+	extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
+
+again:
+	if (!(w = chkvow(hyend + 1)))
+		return;
+	hyend = w;
+	if (!(w = chkvow(hyend)))
+		return;
+	nhyend = w;
+	maxval = 0;
+	w--;
+	while (++w < hyend && w < wdend - 1) {
+		val = 1;
+		if (w == wdstart)
+			val *= dilook('a', cbits(*w), bxh);
+		else if (w == wdstart + 1)
+			val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
+		else
+			val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
+		val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
+		val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
+		if (val > maxval) {
+			maxval = val;
+			maxw = w + 1;
+		}
+	}
+	hyend = nhyend;
+	if (maxval > thresh)
+		*hyp++ = maxw;
+	goto again;
+}
+
+
+dilook(int a, int b, char t[26][13])
+{
+	int i, j;
+
+	i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
+	if (!(j & 01))
+		i >>= 4;
+	return(i & 017);
+}
+
+
+/* here beginneth the tex hyphenation code, as interpreted freely */
+/* the main difference is that there is no attempt to squeeze space */
+/* as tightly at tex does. */
+
+static int	texit(Tchar *, Tchar *);
+static int	readpats(void);
+static void	install(char *);
+static void	fixup(void);
+static int	trieindex(int, int);
+static int	trieindexpart(int);
+
+#define PARTMAX (27+32)		/* latin-1-specific sizes: '.', a-z, then 32 slots for 0xE0-0xFF */
+#define TRIEMAX (PARTMAX*PARTMAX+PARTMAX)
+
+static char	pats[50000];	/* size ought to be computed dynamically */
+static char	*nextpat = pats;
+static char	*trie[TRIEMAX];
+
+int texhyphen(void)
+{
+	static int loaded = 0;		/* -1: couldn't find tex file */
+
+	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
+		return 0;
+	if (loaded == 0) {
+		if (readpats())
+			loaded = 1;
+		else
+			loaded = -1;
+	}
+	return texit(wdstart, wdend);
+}
+
+static int texit(Tchar *start, Tchar *end)	/* hyphenate as in tex, return # found */
+{
+	int nw, i, k, equal, cnt[500+1];	/* cnt[] is indexed 0..nw and nw can reach 500 */
+	char w[500+1], *np, *pp, *wp, *xpp, *xwp;
+
+	w[0] = '.';	/* w[] holds the lower-cased word wrapped as ".word." */
+	for (nw = 1; start <= end && nw < 500-1; nw++, start++)
+		w[nw] = maplow(cbits(*start));
+	start -= (nw - 1);	/* rewind start to the first character of the word */
+	w[nw++] = '.';
+	w[nw] = 0;
+/*
+ * printf("try %s\n", w);
+*/
+	for (i = 0; i <= nw; i++)
+		cnt[i] = '0';
+
+	for (wp = w; wp+1 < w+nw; wp++) {
+		for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
+			if (pp == 0		/* no trie entry */
+			 || *pp != *wp		/* no match on 1st letter */
+			 || *(pp+1) != *(wp+1))	/* no match on 2nd letter */
+				break;		/*   so move to next letter of word */
+			equal = 1;
+			for (xpp = pp+2, xwp = wp+2; *xpp; )
+				if (*xpp++ != *xwp++) {
+					equal = 0;
+					break;
+				}
+			if (equal) {
+				np = xpp+1;	/* numpat */
+				for (k = wp-w; *np; k++, np++)
+					if (*np > cnt[k])	/* keep the largest digit seen for each position */
+						cnt[k] = *np;
+/*
+ * printf("match: %s  %s\n", pp, xpp+1);
+*/
+			}
+			pp += *(pp-1);	/* skip over pattern and numbers to next */
+		}
+	}
+/*
+ * for (i = 0; i < nw; i++) printf("%c", w[i]);
+ * printf("  ");
+ * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
+ * printf("\n");
+*/
+/*
+ * 	for (i = 1; i < nw - 1; i++) {
+ * 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
+ * 			printf("-");
+ * 		if (cbits(start[i-1]) != '.')
+ * 			printf("%c", cbits(start[i-1]));
+ * 	}
+ * 	printf("\n");
+*/
+	for (i = 1; i < nw -1; i++)
+		if (i > 2 && i < nw - 3 && cnt[i] % 2 && hyp < hyptr + NHYP - 1)	/* odd digit marks a break; same hyptr limit as exword */
+			*hyp++ = start + i - 1;
+	return hyp - hyptr;	/* non-zero if a hyphen was found */
+}
+
+/*
+	This code assumes that hyphen.tex looks like
+		% some comments
+		\patterns{ % more comments
+		pat5ter4ns, 1 per line, SORTED, nothing else
+		}
+		more goo
+		\hyphenation{ % more comments
+		ex-cep-tions, one per line; i ignore this part for now
+		}
+
+	this code is NOT robust against variations.  unfortunately,
+	it looks like every local language version of this file has
+	a different format.  i have also made no provision for weird
+	characters.  sigh.
+*/
+
+static int readpats(void)	/* install every line between \patterns{ and }; returns 0 if no file could be opened, else 1 */
+{
+	FILE *fp;
+	char buf[200], buf1[200];
+
+	if ((fp = fopen(TEXHYPHENS, "r")) == NULL
+	 && (fp = fopen(DWBalthyphens, "r")) == NULL) {
+		ERROR "warning: can't find hyphen.tex" WARN;
+		return 0;
+	}
+
+	while (fgets(buf, sizeof buf, fp) != NULL) {	/* skip ahead to the \patterns{ line */
+		if (sscanf(buf, "%199s", buf1) == 1	/* a blank line converts nothing: don't read buf1 then */
+		 && strcmp(buf1, "\\patterns{") == 0)
+			break;
+	}
+	while (fgets(buf, sizeof buf, fp) != NULL) {	/* one pattern per line until the closing brace */
+		if (buf[0] == '}')
+			break;
+		install(buf);
+	}
+	fclose(fp);
+	fixup();
+	return 1;
+}
+
+static void install(char *s)	/* map ab4c5de to: 12 abcde \0 00405 \0 */
+{
+	int npat, lastpat;
+	char num[500];
+
+	if (2*strlen(s) + 4 > (size_t)(pats + sizeof(pats) - nextpat)) {	/* worst case: count, letters, \0, digits, \0 */
+		ERROR "tex hyphenation table overflow, tail end ignored" WARN;
+		return;		/* checked before any byte is stored, so the table stays intact */
+	}
+	num[0] = '0';
+	*nextpat++ = ' ';	/* fill in with count later */
+	for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
+		if (isdigit((unsigned char)*s)) {	/* bytes >= 0x80 may be negative as plain char */
+			num[npat] = *s;
+			lastpat = npat;
+		} else {
+			*nextpat++ = *s;
+			npat++;
+			num[npat] = '0';
+		}
+	}
+	*nextpat++ = 0;
+	num[lastpat+1] = 0;	/* digits after the last explicit one are dropped */
+	strcpy(nextpat, num);	/* strcpy, not strcat: must not depend on what nextpat held before */
+	nextpat += strlen(nextpat) + 1;
+}
+
+static void fixup(void)	/* build indexes of where . a b c ... start */
+{
+	char *p, *lastc;
+	int n;
+
+	for (lastc = pats, p = pats+1; p < nextpat; p++)
+		if (*p == ' ') {
+			*lastc = p - lastc;
+			lastc = p;
+		}
+	*lastc = p - lastc;
+	for (p = pats+1; p < nextpat; ) {
+		n = trieindex(p[0], p[1]);
+		if (trie[n] == 0)
+			trie[n] = p;
+		p += p[-1];
+	}
+	/* printf("pats = %d\n", nextpat - pats); */
+}
+
+static int trieindex(int d1, int d2)
+{
+	int i;
+
+	i = PARTMAX*trieindexpart(d1) + trieindexpart(d2);
+	if (!(0 <= i && i < TRIEMAX)) {
+		fprintf(stderr, "i = %d\n", i);
+		fprintf(stderr, "d1 = %x = %d\n", d1, d1);
+		fprintf(stderr, "d2 = %x = %d\n", d2, d2);
+		fprintf(stderr, "part(d1) = %x = %d\n", trieindexpart(d1), trieindexpart(d1));
+		fprintf(stderr, "part(d2) = %x = %d\n", trieindexpart(d2), trieindexpart(d2));
+		assert(0);
+	}
+	return i;
+}
+
+static int trieindexpart(int d)	/* map one character to the row/column number used by trieindex */
+{
+	if (cbits(d) == '.') return 0;	/* '.' (texit wraps each word in dots) gets slot 0 */
+	if (cbits(d) <= 'z') return d - 'a' + 1;	/* 'a'..'z' -> 1..26; NOTE(review): values below 'a' other than '.' give <= 0 */
+	else return latcbits(d) - 0xE0 + 27; /* L'à' comes after 'z' */
+}
diff --git a/patch/troff-hyphenate-latin-1-fixed/n8.c.orig b/patch/troff-hyphenate-latin-1-fixed/n8.c.orig
new file mode 100644
index 0000000..8c2112c
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/n8.c.orig
@@ -0,0 +1,545 @@
+#include "tdef.h"
+#include "fns.h"
+#include "ext.h"
+#include <assert.h>
+
+#define	HY_BIT	0200	/* stuff in here only works for 7-bit ascii */
+			/* this value is used (as a literal) in suftab.c */
+			/* to encode possible hyphenation points in suffixes. */
+			/* it could be changed, by widening the tables */
+			/* to be shorts instead of chars. */
+
+/*
+ * troff8.c
+ *
+ * hyphenation
+ */
+
+int	hexsize = 0;		/* hyphenation exception list size */
+char	*hbufp = NULL;		/* base of list */
+char	*nexth = NULL;		/* first free slot in list */
+Tchar	*hyend;
+
+#define THRESH 160 		/* digram goodness threshold */
+int	thresh = THRESH;
+
+int	texhyphen(void);
+static	int	alpha(Tchar);
+
+void hyphen(Tchar *wp)
+{
+	int j;
+	Tchar *i;
+
+	i = wp;
+	while (punct((*i++)))
+		;
+	if (!alpha(*--i))
+		return;
+	wdstart = i++;
+	while (alpha(*i++))
+		;
+	hyend = wdend = --i - 1;
+	while (punct((*i++)))
+		;
+	if (*--i)
+		return;
+	if (wdend - wdstart < 4)	/* 4 chars is too short to hyphenate */
+		return;
+	hyp = hyptr;
+	*hyp = 0;
+	hyoff = 2;
+
+	/* for now, try exceptions first, then tex (if hyphalg is non-zero),
+	   then suffix and digram if tex didn't hyphenate it at all.
+	*/
+
+	if (!exword() && !texhyphen() && !suffix())
+		digram();
+
+	/* this appears to sort hyphenation points into increasing order */
+	*hyp++ = 0;
+	if (*hyptr)
+		for (j = 1; j; ) {
+			j = 0;
+			for (hyp = hyptr + 1; *hyp != 0; hyp++) {
+				if (*(hyp - 1) > *hyp) {
+					j++;
+					i = *hyp;
+					*hyp = *(hyp - 1);
+					*(hyp - 1) = i;
+				}
+			}
+		}
+}
+
+static alpha(Tchar i)	/* non-zero if really alphabetic */
+{
+	if (ismot(i))
+		return 0;
+	else if (cbits(i) >= ALPHABET)	/* this isn't very elegant, but there's */
+		return 0;		/* no good way to make sure i is in range for */
+	else				/* the call of isalpha */
+		return isalpha(cbits(i));
+}
+
+
+punct(Tchar i)
+{
+	if (!i || alpha(i))
+		return(0);
+	else
+		return(1);
+}
+
+
+void caseha(void)	/* set hyphenation algorithm */
+{
+	hyphalg = HYPHALG;
+	if (skip())
+		return;
+	noscale++;
+	hyphalg = atoi0();
+	noscale = 0;
+}
+
+
+void caseht(void)	/* set hyphenation threshold;  not in manual! */
+{
+	thresh = THRESH;
+	if (skip())
+		return;
+	noscale++;
+	thresh = atoi0();
+	noscale = 0;
+}
+
+
+char *growh(char *where)
+{
+	char *new;
+
+	hexsize += NHEX;
+	if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
+		return NULL;
+	if (new == hbufp) {
+		return where;
+	} else {
+		int diff;
+		diff = where - hbufp;
+		hbufp = new;
+		return new + diff;
+	}
+}
+
+
+void casehw(void)
+{
+	int i, k;
+	char *j;
+	Tchar t;
+
+	if (nexth == NULL) {
+		if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
+			ERROR "No space for exception word list." WARN;
+			return;
+		}
+		hexsize = NHEX;
+	}
+	k = 0;
+	while (!skip()) {
+		if ((j = nexth) >= hbufp + hexsize - 2)
+			if ((j = nexth = growh(j)) == NULL)
+				goto full;
+		for (;;) {
+			if (ismot(t = getch()))
+				continue;
+			i = cbits(t);
+			if (i == ' ' || i == '\n') {
+				*j++ = 0;
+				nexth = j;
+				*j = 0;
+				if (i == ' ')
+					break;
+				else
+					return;
+			}
+			if (i == '-') {
+				k = HY_BIT;
+				continue;
+			}
+			*j++ = maplow(i) | k;
+			k = 0;
+			if (j >= hbufp + hexsize - 2)
+				if ((j = growh(j)) == NULL)
+					goto full;
+		}
+	}
+	return;
+full:
+	ERROR "Cannot grow exception word list." WARN;
+	*nexth = 0;
+}
+
+
+int exword(void)
+{
+	Tchar *w;
+	char *e, *save;
+
+	e = hbufp;
+	while (1) {
+		save = e;
+		if (e == NULL || *e == 0)
+			return(0);
+		w = wdstart;
+		while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
+			e++;
+			w++;
+		}
+		if (!*e) {
+			if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
+				w = wdstart;
+				for (e = save; *e; e++) {
+					if (*e & HY_BIT)
+						*hyp++ = w;
+					if (hyp > hyptr + NHYP - 1)
+						hyp = hyptr + NHYP - 1;
+					w++;
+				}
+				return(1);
+			} else {
+				e++;
+				continue;
+			}
+		} else
+			while (*e++)
+				;
+	}
+}
+
+
+suffix(void)
+{
+	Tchar *w;
+	char *s, *s0;
+	Tchar i;
+	extern char *suftab[];
+
+again:
+	i = cbits(*hyend);
+	if (!alpha(i))
+		return(0);
+	if (i < 'a')
+		i -= 'A' - 'a';
+	if ((s0 = suftab[i-'a']) == 0)
+		return(0);
+	for (;;) {
+		if ((i = *s0 & 017) == 0)
+			return(0);
+		s = s0 + i - 1;
+		w = hyend - 1;
+		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
+			s--;
+			w--;
+		}
+		if (s == s0)
+			break;
+		s0 += i;
+	}
+	s = s0 + i - 1;
+	w = hyend;
+	if (*s0 & HY_BIT)
+		goto mark;
+	while (s > s0) {
+		w--;
+		if (*s-- & HY_BIT) {
+mark:
+			hyend = w - 1;
+			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
+				continue;
+			if (!chkvow(w))
+				return(0);
+			*hyp++ = w;
+		}
+	}
+	if (*s0 & 040)
+		return(0);
+	if (exword())
+		return(1);
+	goto again;
+}
+
+
+maplow(int i)
+{
+	if (isupper(i))
+		i = tolower(i);
+	return(i);
+}
+
+
+vowel(int i)
+{
+	switch (i) {
+	case 'a': case 'A':
+	case 'e': case 'E':
+	case 'i': case 'I':
+	case 'o': case 'O':
+	case 'u': case 'U':
+	case 'y': case 'Y':
+		return(1);
+	default:
+		return(0);
+	}
+}
+
+
+Tchar *chkvow(Tchar *w)
+{
+	while (--w >= wdstart)
+		if (vowel(cbits(*w)))
+			return(w);
+	return(0);
+}
+
+
+void digram(void)
+{
+	Tchar *w;
+	int val;
+	Tchar *nhyend, *maxw;
+	int maxval;
+	extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
+
+again:
+	if (!(w = chkvow(hyend + 1)))
+		return;
+	hyend = w;
+	if (!(w = chkvow(hyend)))
+		return;
+	nhyend = w;
+	maxval = 0;
+	w--;
+	while (++w < hyend && w < wdend - 1) {
+		val = 1;
+		if (w == wdstart)
+			val *= dilook('a', cbits(*w), bxh);
+		else if (w == wdstart + 1)
+			val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
+		else
+			val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
+		val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
+		val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
+		if (val > maxval) {
+			maxval = val;
+			maxw = w + 1;
+		}
+	}
+	hyend = nhyend;
+	if (maxval > thresh)
+		*hyp++ = maxw;
+	goto again;
+}
+
+
+dilook(int a, int b, char t[26][13])
+{
+	int i, j;
+
+	i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
+	if (!(j & 01))
+		i >>= 4;
+	return(i & 017);
+}
+
+
+/* here beginneth the tex hyphenation code, as interpreted freely */
+/* the main difference is that there is no attempt to squeeze space */
+/* as tightly at tex does. */
+
+static int	texit(Tchar *, Tchar *);
+static int	readpats(void);
+static void	install(char *);
+static void	fixup(void);
+static int	trieindex(int, int);
+
+static char	pats[50000];	/* size ought to be computed dynamically */
+static char	*nextpat = pats;
+static char	*trie[27*27];	/* english-specific sizes */
+
+int texhyphen(void)
+{
+	static int loaded = 0;		/* -1: couldn't find tex file */
+
+	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
+		return 0;
+	if (loaded == 0) {
+		if (readpats())
+			loaded = 1;
+		else
+			loaded = -1;
+	}
+	return texit(wdstart, wdend);
+}
+
+static int texit(Tchar *start, Tchar *end)	/* hyphenate as in tex, return # found */
+{
+	int nw, i, k, equal, cnt[500];
+	char w[500+1], *np, *pp, *wp, *xpp, *xwp;
+
+	w[0] = '.';
+	for (nw = 1; start <= end && nw < 500-1; nw++, start++)
+		w[nw] = maplow(tolower(cbits(*start)));
+	start -= (nw - 1);
+	w[nw++] = '.';
+	w[nw] = 0;
+/*
+ * printf("try %s\n", w);
+*/
+	for (i = 0; i <= nw; i++)
+		cnt[i] = '0';
+
+	for (wp = w; wp+1 < w+nw; wp++) {
+		for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
+			if (pp == 0		/* no trie entry */
+			 || *pp != *wp		/* no match on 1st letter */
+			 || *(pp+1) != *(wp+1))	/* no match on 2nd letter */
+				break;		/*   so move to next letter of word */
+			equal = 1;
+			for (xpp = pp+2, xwp = wp+2; *xpp; )
+				if (*xpp++ != *xwp++) {
+					equal = 0;
+					break;
+				}
+			if (equal) {
+				np = xpp+1;	/* numpat */
+				for (k = wp-w; *np; k++, np++)
+					if (*np > cnt[k])
+						cnt[k] = *np;
+/*
+ * printf("match: %s  %s\n", pp, xpp+1);
+*/
+			}
+			pp += *(pp-1);	/* skip over pattern and numbers to next */
+		}
+	}
+/*
+ * for (i = 0; i < nw; i++) printf("%c", w[i]);
+ * printf("  ");
+ * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
+ * printf("\n");
+*/
+/*
+ * 	for (i = 1; i < nw - 1; i++) {
+ * 		if (i > 2 && i < nw - 3 && cnt[i] % 2)
+ * 			printf("-");
+ * 		if (cbits(start[i-1]) != '.')
+ * 			printf("%c", cbits(start[i-1]));
+ * 	}
+ * 	printf("\n");
+*/
+	for (i = 1; i < nw -1; i++)
+		if (i > 2 && i < nw - 3 && cnt[i] % 2)
+			*hyp++ = start + i - 1;
+	return hyp - hyptr;	/* non-zero if a hyphen was found */
+}
+
+/*
+	This code assumes that hyphen.tex looks like
+		% some comments
+		\patterns{ % more comments
+		pat5ter4ns, 1 per line, SORTED, nothing else
+		}
+		more goo
+		\hyphenation{ % more comments
+		ex-cep-tions, one per line; i ignore this part for now
+		}
+
+	this code is NOT robust against variations.  unfortunately,
+	it looks like every local language version of this file has
+	a different format.  i have also made no provision for weird
+	characters.  sigh.
+*/
+
+static int readpats(void)
+{
+	FILE *fp;
+	char buf[200], buf1[200];
+
+	if ((fp = fopen(TEXHYPHENS, "r")) == NULL
+	 && (fp = fopen(DWBalthyphens, "r")) == NULL) {
+		ERROR "warning: can't find hyphen.tex" WARN;
+		return 0;
+	}
+
+	while (fgets(buf, sizeof buf, fp) != NULL) {
+		sscanf(buf, "%s", buf1);
+		if (strcmp(buf1, "\\patterns{") == 0)
+			break;
+	}
+	while (fgets(buf, sizeof buf, fp) != NULL) {
+		if (buf[0] == '}')
+			break;
+		install(buf);
+	}
+	fclose(fp);
+	fixup();
+	return 1;
+}
+
+static void install(char *s)	/* map ab4c5de to: 12 abcde \0 00405 \0 */
+{
+	int npat, lastpat;
+	char num[500], *onextpat = nextpat;
+
+	num[0] = '0';
+	*nextpat++ = ' ';	/* fill in with count later */
+	for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
+		if (isdigit(*s)) {
+			num[npat] = *s;
+			lastpat = npat;
+		} else {
+			*nextpat++ = *s;
+			npat++;
+			num[npat] = '0';
+		}
+	}
+	*nextpat++ = 0;
+	if (nextpat > pats + sizeof(pats)-20) {
+		ERROR "tex hyphenation table overflow, tail end ignored" WARN;
+		nextpat = onextpat;
+	}
+	num[lastpat+1] = 0;
+	strcat(nextpat, num);
+	nextpat += strlen(nextpat) + 1;
+}
+
+static void fixup(void)	/* build indexes of where . a b c ... start */
+{
+	char *p, *lastc;
+	int n;
+
+	for (lastc = pats, p = pats+1; p < nextpat; p++)
+		if (*p == ' ') {
+			*lastc = p - lastc;
+			lastc = p;
+		}
+	*lastc = p - lastc;
+	for (p = pats+1; p < nextpat; ) {
+		n = trieindex(p[0], p[1]);
+		if (trie[n] == 0)
+			trie[n] = p;
+		p += p[-1];
+	}
+	/* printf("pats = %d\n", nextpat - pats); */
+}
+
+static int trieindex(int d1, int d2)
+{
+	int i;
+
+	i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
+	assert(0 <= i && i < 27*27);
+	return i;
+}
diff --git a/patch/troff-hyphenate-latin-1-fixed/notes b/patch/troff-hyphenate-latin-1-fixed/notes
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/notes
diff --git a/patch/troff-hyphenate-latin-1-fixed/readme b/patch/troff-hyphenate-latin-1-fixed/readme
new file mode 100644
index 0000000..0d3bd32
--- /dev/null
+++ b/patch/troff-hyphenate-latin-1-fixed/readme
@@ -0,0 +1,9 @@
+Add support for Latin-1 characters in hyphenation algorithm
+
+Ideally, it would support UTF-8, but Latin-1 is a big improvement
+over ASCII nonetheless.
+
+This patch disables the suffix function, which I haven't (yet) gotten
+to work with Latin-1.  Suffix is, however, hard-coded for English,
+and, in any case, I'm not sure how important it is.  In the meantime,
+it isn't a huge loss.
author	John Ankarström <john@ankarstrom.se>	2021-01-28 21:33:30 +0000
committer	John Ankarström <john@ankarstrom.se>	2021-01-28 21:33:30 +0000
commit	51b5b02f52cc6f314029f3f2fe97afdc26ba0f25 (patch)
tree	711d1efadeb78ad9b2d9f7b629350358838bc626 /patch/troff-hyphenate-latin-1-fixed
parent	f3fd330cddade1c66d0f101d5cc6f657c4cd1bb6 (diff)
download	plan9-51b5b02f52cc6f314029f3f2fe97afdc26ba0f25.tar.gz