From c3143fbdb7ea46539023e11cb30a7b14434030f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Fri, 23 Oct 2020 02:11:59 +0200 Subject: release 1.2 --- doc/index.html | 802 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 802 insertions(+) create mode 100644 doc/index.html (limited to 'doc/index.html') diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..4b0c068 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,802 @@ + + + +tt.c + + +

tt.c

+

tt.c -- tangle to, written by John Ankarström → tt.c

+ +
#include <stdio.h>
+
+#ifdef _WIN32
+#include <shlwapi.h>
+#pragma comment(lib, "Shlwapi.lib")
+#else
+#include <ctype.h>
+#include <errno.h>
+#include <string.h>
+#include <strings.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#endif
+
+<<definitions>>
+
+/* err: report a failed library call on stderr as "<string>: <strerror(errno)>"
+ * and exit with the given code. The format string consumes exactly the two
+ * arguments that are passed to fprintf. */
+#define err(code, string) do { fprintf(stderr, "%s: %s\n", string, strerror(errno)); exit(code); } while (0)
+/* die: print a printf-style message to stderr and exit with status 1. */
+#define die(...) do { fprintf(stderr, __VA_ARGS__); exit(1); } while (0)
+/* Minimal boolean vocabulary used by the rest of the program. */
+#define true 1
+#define false 0
+#define bool int
+
+<<declarations>>
+
+int main(int argc, char *argv[]) {
+  <<main.declarations>>
+  <<main.globals>>
+  <<main.options>>
+  <<main.input>>
+  <<main.output>>
+  return 0;
+}
+
+void reference(char *line) {
+  <<reference.declarations>>
+  <<reference.parse>>
+  <<reference.add>>
+}
+
+bool insertion(char *line) {
+  <<insertion.declarations>>
+  <<insertion.parse>>
+  <<insertion.add>>
+}
+
+
+

tt.input.c

+

References in source input

+ +

The references found in the source input are stored as an array of strings +in the global refs variable: → declarations

+ +
char **refs; /* references */
+int refs_c; /* count */
+int refs_s; /* size (number of elements allocated for) */
+
+ +

It is allocated at the beginning of the execution to contain an array of +ten strings. The refs_s variable keeps track of the amount of allocated +space, while refs_c holds the number of actual elements: → main.globals

+ +
  refs_c = 0;
+  refs_s = 10;
+  refs = malloc(refs_s * sizeof(char *));
+  if (refs == NULL) err(1, "malloc");
+
+ +

Insertions in source input

+ +

tt represents every insertion as an array of strings, where each string +corresponds to a line to be inserted. All insertions are stored in the +global ins array: → declarations

+ +
char ***ins; /* insertions */
+
+ +

The position of each insertion in the ins array is always equal to the +position of the corresponding reference in the refs array -- to find what +lines should be inserted at destination X, one must find the value P such +that refs[P] is equal to X. Then, the corresponding insertion will be equal +to ins[P].

+ +

In other words, the ins array should always be of the same length as refs. +As such, the refs_s and refs_c variables are used for ins as well. The ins +array is allocated to hold the same number of elements as refs. Furthermore, +its elements are set to NULL, signifying the absence of any insertion at +that index: → main.globals

+ +
  ins = malloc(refs_s * sizeof(char **));
+  if (ins == NULL) err(1, "malloc");
+  for (i = 0; i < refs_s; i++)
+    ins[i] = NULL;
+
+ +

→ main.declarations

+ +
int i;
+
+ +

Parsing standard input

+ +

Text is read from the standard input, line by line, into a line variable. +Two additional variables, line_s and line_l, keep track of the amount of +allocated space and the actual number of characters in the string, +respectively: → main.declarations

+ +
  char *line;
+  int line_l; /* length */
+  int line_s; /* size (number of characters allocated for) */
+
+ +

It initially is allocated to hold 100 characters: → main.input

+ +
  line_l = 0;
+  line_s = 100;
+  line = malloc(1 + line_s * sizeof(char));
+  if (line == NULL) err(1, "malloc");
+
+ +

Lines are read character by character until end of file. First, the read +character is assigned to the variable b. When it is certain that it is not +EOF, then it is assigned to the variable c: → main.input

+ +
  while ((b = getchar()) != EOF) {  
+    c = b;
+
+ +

→ main.declarations

+ +
  int b;         /* result of getchar()/fgetc(); must be int so that EOF is distinct from every byte value */
+  char c = '\n'; /* last character read; starts as newline so empty input leaves no pending line to finish */
+
+ +

On every iteration, tt checks whether the read character is a newline. If +not, the character is added to the line variable, which is re-allocated if +necessary. The line_l, keeping track of the line's length, is incremented +as well: → main.input

+ +
    if (c != '\n') {
+      if (line_l + 1 > line_s) {
+        line_s += 20;
+        tmp = realloc(line, 1 + line_s * sizeof(char));
+        if (tmp == NULL) err(1, "malloc");
+        line = tmp;
+      }
+      line[line_l++] = c;
+      continue;
+    }
+
+ +

The tmp variable used in the re-allocation has a type which is identical to +that of the line variable: → main.declarations

+ +
  char *tmp;
+
+ +

If the read character is a newline, then the program "finishes" the line, +adding a final NUL character and resetting line_l: → main.input

+ +
finish:
+    line[line_l] = '\0';
+    line_l = 0;
+
+ +

Before parsing the line, we make sure to skip it if it is empty and +following a non-code line: → main.input

+ +
    if (strlen(code_prefix) == 0 && !wascode && strcmp(line, "") == 0) {
+      continue;
+    }
+
+ +

This is only desirable if CODE_PREFIX is empty, because then, there is no +way for the writer of the source input to, for appearance's sake, leave an +empty line between non-code lines and code lines; any empty line will +inevitably be interpreted as a code line. The code above circumvents +this.

+ +

This aesthetical nicety requires the program to keep track of whether the +previous line was a code line or not: → main.declarations

+ +
  bool wascode = false;
+
+ +

Now, it is time to check whether the read line is a code line (an insertion) +or a documentation line (containing a reference): → main.input

+ +
    /* Remember whether this line was code: the blank-line skip earlier in
+     * the loop tests wascode to decide whether an empty line follows a
+     * non-code line. */
+    wascode = insertion(line);
+    if (!wascode) reference(line);
+  }
+
+ +

The insertion and reference functions modify the ins and refs variables +according to the contents of the line.

+ +

Finally, after the loop is finished -- meaning that EOF has been reached -- +we must check whether the final character was a newline; if it was not, the +final line of source input has not been processed, as lines are processed +only when the terminating newline is encountered.

+ +

Thus, if the final character was not a newline, tt goes back and finishes the +final line: → main.input

+ +
  if (c != '\n') { c = '\n'; goto finish; }
+
+ +

Identifying and processing documentation lines containing references

+ +

The reference function is responsible for processing references in source +input lines: → declarations

+ +
void reference(char *line);
+
+ +

Parsing the line

+ +

Documentation lines are formatted as follows: ->

+ +
documentation line ::= DOC_PREFIX anything [reference]
+
+reference ::= "->" [whitespace] identifier [whitespace]
+identifier ::= not whitespace
+
+ +

In order to identify whether a given line actually is a documentation line +containing a reference, the line variable is aliased to ln, which will be +modified instead of line: → reference.declarations

+ +
  char *ln = line;
+
+ +

First, we ensure the line begins with the doc_prefix: → reference.parse

+ +
  if (strncmp(ln, doc_prefix, strlen(doc_prefix)) != 0) return;
+
+ +

Then, we ensure that a hyphen is present: → reference.parse

+ +
hyphen:
+  if (*ln == '\0') return;
+  else if (*ln == '-') { ln++; goto lessthan; }
+  else { ln++; goto hyphen; }
+
+ +

After finding the hyphen, we check whether a greater-than sign follows it. +If not, we keep looking for another hyphen. → reference.parse

+ +
lessthan:
+  if (*ln != '>') goto hyphen;
+  else ln++;
+
+ +

After finding a greater-than sign following a hyphen (->), we ignore all +whitespace, if there is any. If the end of the line has been reached, or is +reached, by this point, then it will be interpreted as an empty reference, +resetting the current reference (meaning that subsequent code lines will not +be attached to any reference): → reference.parse

+ +
space:
+  /* <ctype.h> functions require a value representable as unsigned char */
+  if (isspace((unsigned char)*ln)) { ln++; goto space; }
+  /* Empty reference: clear the current reference in place. ref must keep
+   * pointing at its heap buffer, because later references are written into
+   * it with sprintf and it is eventually passed to free. */
+  if (*ln == '\0') { ref[0] = '\0'; return; }
+
+ +

Now, a valid reference should be a string of non-space characters, +followed optionally by whitespace, but not anything other than whitespace: +→ reference.parse

+ +
  for (i = 0; i < strlen(ln); i++)
+    if (isspace(ln[i])) {
+      for (j = i; j < strlen(ln); j++)
+        if (!isspace(ln[j])) return;
+      break;
+    }
+
+ +

→ reference.declarations

+ +
  int i;
+  int j;
+
+ +

After the loop above, i will be set to the index of the first encountered +space or the end of the line. Any trailing whitespace should be ignored: +→ reference.parse

+ +
  ln[i] = '\0';
+
+ +

Adding the reference

+ +

At this point, we have found a valid reference, which should now be added to +the global refs array.

+ +

First, however, it should be mentioned that reference identifiers have a +maximum length of 80 characters: → definitions

+ +
#define REFMAX 80
+
+ +

Thus, any reference identifier longer than REFMAX is truncated, with a +warning printed to the standard error stream: → reference.add

+ +
  if (strlen(ln) > REFMAX) {
+    fprintf(stderr, "Warning: Truncating identifier exceeding %d characters\n",
+      REFMAX);
+    ln[REFMAX] = '\0';
+  }
+
+ +

It should also be mentioned that the current reference is always stored in a +global variable, from which the insertion(char *) function knows with which +reference to associate each code line: → declarations

+ +
char *ref;
+
+ +

It is allocated in the beginning of the program's execution: → main.globals

+ +
  ref = malloc(1 + REFMAX * sizeof(char));
+  if (ref == NULL) err(1, "malloc");
+  ref[0] = '\0'; /* no current reference yet; insertion() tests ref[0] before any reference is set */
+
+ +

It is freed before the output section of the program, at which point it is +no longer needed: → main.output

+ +
  free(ref);
+
+ +

The variable is set by our reference function: → reference.add

+ +
  sprintf(ref, "%s", ln); /* set current reference */
+  ref[strlen(ln)] = '\0';
+
+ +

Now remains the work of adding the reference to the global refs variable -- +unless it already exists in refs: → reference.add

+ +
  for (i = 0; i < refs_c; i++)
+    if (strcmp(refs[i], ref) == 0) return;
+
+ +

If the reference truly is new, we notify the user: → reference.add

+ +
  fprintf(stderr, "New reference: %s\n", ref);
+
+ +

Before adding the new reference to refs, we re-allocate refs (and therefore +also ins, which should always be as large as refs), if needed: +→ reference.add

+ +
  if (++refs_c > refs_s) {
+    refs_s += 10;
+    tmp = realloc(refs, refs_s * sizeof(char *));
+    if (tmp == NULL) err(1, "malloc");
+    refs = tmp;
+    tmp2 = realloc(ins, refs_s * sizeof(char *));
+    if (tmp2 == NULL) err(1, "malloc");
+    ins = tmp2;
+    for (i = refs_s - 10; i < refs_s; i++) /* TODO: is this right? */
+      ins[i] = NULL;
+  }
+
+ +

→ reference.declarations

+ +
  char **tmp;
+  char ***tmp2;
+
+ +

Notice that the code above also increases the refs_c count. Now, everything +else is done, and the reference is ready to be added: → reference.add

+ +
  refs[refs_c-1] = malloc(1 + REFMAX * sizeof(char));
+  if (refs[refs_c-1] == NULL) err(1, "malloc"); /* do not sprintf into a failed allocation */
+  sprintf(refs[refs_c-1], "%s", ref);
+
+ +

Identifying and processing code lines

+ +

The insertion function is responsible for processing code lines: +→ declarations

+ +
bool insertion(char *line);
+
+ +

It returns true if the given line is a code line (i.e., an insertion).

+ +

Parsing the code line

+ +

First of all, if there is no current reference, the insertion should be +ignored: → insertion.parse

+ +
  if (ref[0] == '\0') return false;
+
+ +

If there is a CODE_PREFIX, we ensure that the line begins with it. +Likewise, if there is a DOC_PREFIX, we ensure that the line does not +begin with it: → insertion.parse

+ +
  if (strlen(code_prefix) > 0)
+    if (strncmp(line, code_prefix, strlen(code_prefix)) != 0) return false;
+  if (strlen(doc_prefix) > 0)
+    if (strncmp(line, doc_prefix, strlen(doc_prefix)) == 0) return false;
+
+ +

As you can see, the DOC_PREFIX is given precedence over the CODE_PREFIX.

+ +

Adding the code line to the insertions

+ +

Now that we know the line contains an insertion, we must find the index +of the current reference in the refs array: → insertion.add

+ +
  for (i = 0; i < refs_c; i++)
+    if (strcmp(refs[i], ref) == 0) break;
+
+ +

→ insertion.declarations

+ +
  int i;
+
+ +

Our goal is to add the insertion to the corresponding position in the ins +array. If there is no insertion at that position, the value will be NULL: +→ insertion.add

+ +
  if (ins[i] == NULL) {
+    /* room for two pointers: the new line and the terminating NULL */
+    ins[i] = malloc(2 * sizeof(char *));
+    if (ins[i] == NULL) err(1, "malloc");
+    len = 0;
+  }
+
+ +

If ins[i] is not NULL, then it already contains some number of insertion +strings, terminated by a final NULL value. In order to allocate memory +for the new insertion, we find the position of the final NULL value, +corresponding to the length of the ins[i] array: → insertion.add

+ +
  else {
+    /* count the lines already stored for this reference */
+    for (len = 0; ins[i][len] != NULL; len++) ;
+    /* len existing lines + the new line + the terminating NULL */
+    tmp = realloc(ins[i], (len + 2) * sizeof(char *));
+    if (tmp == NULL) err(1, "malloc");
+    ins[i] = tmp;
+  }
+
+ +

→ insertion.declarations

+ +
  char **tmp;
+  int len;
+
+ +

Now remains adding the insertion to ins[i]. First, we mark the new final +position: → insertion.add

+ +
  ins[i][len + 1] = NULL;
+
+ +

Then, we allocate memory for the string: → insertion.add

+ +
  ins[i][len] = malloc(1 + strlen(line) * sizeof(char));
+  if (ins[i][len] == NULL) err(1, "malloc");
+
+ +

Finally, we copy the string, returning true, signifying that the line +processed indeed was a code line: → insertion.add

+ +
  strncpy(ins[i][len], line + strlen(code_prefix),
+    strlen(line) - strlen(code_prefix));
+  ins[i][len][strlen(line) - strlen(code_prefix)] = '\0';
+  return true;
+
+ +

Notice also that we make sure to skip the CODE_PREFIX.

+
+

tt.options.c

+

Command-line flags

+ +

tt can be configured by changing the value of three variables: +→ declarations

+ +
char *code_prefix; /* string with which code lines should start */
+char *doc_prefix;  /* string with which documentation lines should start */
+char *out_prefix;  /* string to which the output file name should be appended */
+
+ +

The default values are the following: → main.options

+ +
  code_prefix = "    "; /* code lines should begin with four spaces */
+  doc_prefix = "";      /* other lines are documentation lines */
+  out_prefix = "out/";  /* all output files go in the out/ directory */
+
+ +

Each variable is controlled by a single-letter command-line flag, which +should then be immediately -- without any space -- followed by the +desired value. For example, -ofinal. would set out_prefix to "final.".

+ +

This convention allows for a very simple parsing loop: → main.options

+ +
  for (i = 1; i < argc; i++)
+    if (argv[i][0] == '-') {
+      switch(argv[i][1]) {
+        case 'c':
+          code_prefix = argv[i] + 2;
+          break;
+        case 'd':
+          doc_prefix = argv[i] + 2;
+          break;
+        case 'o':
+          out_prefix = argv[i] + 2;
+          break;
+        case '-':
+          i++;
+          goto end;
+        default:
+          die(USAGE);
+      }
+    } else
+      break;
+end:
+
+ +

If the given argument begins with a hyphen, it is interpreted as a flag. +If the flag is --, then tt ignores the argument and stops looking for flags. +If the flag is unrecognized, the program dies. If the argument does not +begin with a hyphen, it and anything following it will not be interpreted +as a flag.

+ +

USAGE contains information about how to use tt: → definitions

+ +
#define USAGE "usage: %s [-cCODE_PREFIX] [-dDOC_PREFIX] [-oOUTPREFIX] destination ...\n", argv[0]
+
+ +

Of course, we can't just trust the user to provide reasonable values, so we +ensure that the code_prefix and doc_prefix are not identical and that the +out_prefix is not empty -- otherwise, tt would overwrite all destination +files: → main.options

+ +
  if (strcmp(code_prefix, doc_prefix) == 0)
+    die("code_prefix and doc_prefix cannot be identical\n");
+  if (strlen(out_prefix) == 0)
+    die("out_prefix cannot be empty\n");
+
+ +

Command-line arguments

+ +

Having finished parsing command-line flags, it is time to collect the +remaining command-line arguments, which should be one or more destination +files. Our loop above, when broken out of or finished naturally, has set +the i variable to the position of the first non-flag argument in argv (or +simply the position after the last flag in argv).

+ +

First, we check if there actually are any further arguments, or if i is past +the end of the array: → main.options

+ +
  if (i == argc) die(USAGE);
+
+ +

At least one destination file is required. Then, we save the position of the +first destination file in argv in a special variable for later use: +→ main.options

+ +
offset = i;
+
+ +

→ main.declarations

+ +
int offset;
+
+ +

Now, we have successfully finished parsing both flags and arguments, and are +ready to read the lines on the standard input.

+
+

tt.output.c

+

Outputting the results

+ +

At this point, we have collected all references and accompanying insertions +in the source input. Two tasks remain:

+ +
    +
  1. We need to parse the destination files, identifying <<destinations>>.
  2. +
  3. We need to copy the destination files to the tangled files, overwriting +all <<destinations>> with the corresponding insertions.
  4. +
+ +

Both of these tasks will be performed in the same loop: → main.output

+ +
  for (k = offset; k < argc; k++) {
+
+ +

→ main.declarations

+ +
  int k;
+
+ +

The counter k is set to the offset defined in the options section, which +should be equal to the position of the first destination file in argv. +We loop as long as we haven't reached the end of argv.

+ +

On each iteration of the loop, we can obtain from argv the name of the +destination file and copy it to a new string, adding the out_prefix. We'll +call this string tangledfilename: → main.declarations

+ +
  char *tangledfilename;
+
+ +

→ main.output

+ +
    tangledfilename = malloc(1 + (strlen(out_prefix) + strlen(argv[k]) + 50) * sizeof(char));
+    if (tangledfilename == NULL) err(1, "malloc");
+
+    if (sprintf(tangledfilename, "%s%s", out_prefix, argv[k]) == -1)
+      err(1, "sprintf");
+
+ +

Now, we can open the tangled file for writing and the original destination +file for reading. We'll call the handle for tangledfile f and the handle for +argv[k] fo, the o standing for "original": → main.declarations

+ +
  FILE *f;
+  FILE *fo;
+
+ +

→ main.output

+ +
    f = fopen(tangledfilename, "w");
+    if (f == NULL) err(1, "fopen");
+    fo = fopen(argv[k], "r");
+    if (fo == NULL) err(1, "fopen");
+
+ +

Having successfully opened the files, we have no need for tangledfilename: +→ main.output

+ +
    free(tangledfilename);
+
+ +

Parsing the current destination file and writing the tangled file

+ +

The destination file will be parsed in a manner similar to the way in which +the source input was parsed. The same structure will be used: → main.output

+ +
    line_l = 0;
+    /* line and line_s are carried over from the input phase: line still
+     * points at the heap buffer sized for line_s characters, so it is
+     * reused here and grown with realloc when needed. It must not be
+     * pointed at a string literal, which can be neither written nor
+     * reallocated. */
+
+    while ((b = fgetc(fo)) != EOF) {
+      c = b;
+      if (c != '\n') {
+        if (line_l + 1 > line_s) {
+          line_s += 20;
+          tmp = realloc(line, 1 + line_s * sizeof(char));
+          if (tmp == NULL) err(1, "malloc");
+          line = tmp;
+        }
+        line[line_l++] = c;
+        continue;
+      }
+
+ +

Again, characters will be added to the line variable until a newline is +encountered, at which point the collected line will be finished: +→ main.output

+ +
finish2:
+      line[line_l] = '\0';
+      line_l = 0; /* reset line length count */
+
+ +

From here on, however, the loop will look a bit different. First, tt takes +note of the line's indentation, saving it to the indent variable: +→ main.declarations

+ +
  int indent;
+
+ +

Only spaces are currently supported: → main.output

+ +
      ref = line;
+      for (indent = 0; *ref == ' '; ref++) indent++;
+
+ +

Also, as you can see, we re-use the ref variable that was used by the input +parsing, but which is now unused.

+ +

Parsing the <<destination>> is simple: → main.output

+ +
      if (strncmp(ref, "<<", 2) != 0
+        || strncmp(ref + strlen(ref) - 2, ">>", 2) != 0) {
+        fprintf(f, "%s\n", line);
+        continue;
+      }
+
+ +

If no potential destination is found, then the line will be written as-is to +the tangled file, and the loop continues parsing the next line of the file. +If a potential destination is found, however, we store it in the ref +variable, removing the << and >> markers: → main.output

+ +
      ref += 2;
+      ref[strlen(ref) - 2] = '\0';
+
+ +

There is still one thing to check, before we know that the destination is +valid -- it must not contain any whitespace: → main.output

+ +
      for (i = 0; i < strlen(ref); i++)
+        if (isspace(ref[i])) {
+          fprintf(f, "%s\n", line);
+          continue;
+        }
+
+ +

Again, if there is whitespace, then the line does not signify a destination +and should be printed as-is to the resulting tangled file.

+ +

As when parsing the input, long identifiers are truncated: → main.output

+ +
      if (strlen(ref) > REFMAX)
+        fprintf(stderr,
+          "Warning: Truncating identifier exceeding %d characters\n", REFMAX);
+
+ +

Finally, we check whether the destination actually has been referenced by +the source input, warning the user otherwise: → main.output

+ +
      for (i = 0; i < refs_c; i++)
+        if (strncmp(refs[i], ref, REFMAX) == 0) goto found;
+      fprintf(stderr, "Unreferenced destination: %s\n", ref);
+      continue;
+found:
+
+ +

Having established that the identified destination is referenced by the +source input, and having stored in the local i variable the reference's +position in the refs variable, we can retrieve the insertion for the +reference by looking at the same position in the ins variable.

+ +

Our first order of business is to make sure that the insertion is not empty +-- in that case, the user is warned, and the loop goes on to the next line: +→ main.output

+ +
      if (ins[i] == NULL) {
+        fprintf(stderr, "Warning: Insertion for %s is empty\n", ref);
+        continue;
+      }
+
+ +

Now, we are ready to write the insertion for the destination to the tangled +file. Because each insertion is stored as an array of strings, each string +containing a single line of the insertion, we use yet another loop: +→ main.output

+ +
      for (j = 0; ins[i][j] != NULL; j++) {
+        if (ins[i][j + 1] == NULL) {
+          if (strlen(ins[i][j]) == 0)
+            break; /* remove extra newline */
+        }
+        for (m = indent; m > 0; m--) putc(' ', f);
+        fprintf(f, "%s\n", ins[i][j]);
+      }
+    }
+
+ +

→ main.declarations

+ +
int j;
+int m;
+
+ +

Apart from simply printing the inserted line to the tangled file, the code +above also skips any empty line at the end of the insertion and adds the +indentation identified when parsing the line in the destination file +containing the destination identifier.

+ +

Now, we have almost finished parsing the current destination file and +writing to the corresponding tangled file, but -- as before -- we still +haven't processed the final line of the file, if that line ends without +a newline. To fix that, we just run the finishing code again: +→ main.output

+ +
    if (c != '\n') { c = '\n'; goto finish2; }
+
+ +

Finally, we close the handles to the destination file and tangled file: +→ main.output

+ +
    fclose(f);
+    fclose(fo);
+  }
+
+ +

And that is the end of the loop. The loop continues for every destination +file given as an argument, and when it is done, so is the program.

+
+ -- cgit v1.2.3