substitution driven nock interpreter

substitution driven nock interpreter
return

Given the nock “specification” is presented as the following:

Nock 4K

A noun is an atom or a cell. An atom is a natural number. A cell is an ordered pair of nouns.

Reduce by the first matching pattern; variables match any noun.

nock(a)             *a
[a b c]             [a [b c]]

?[a b]              0
?a                  1
+[a b]              +[a b]
+a                  1 + a
=[a a]              0
=[a b]              1

/[1 a]              a
/[2 a b]            a
/[3 a b]            b
/[(a + a) b]        /[2 /[a b]]
/[(a + a + 1) b]    /[3 /[a b]]
/a                  /a

#[1 a b]            a
#[(a + a) b c]      #[a [b /[(a + a + 1) c]] c]
#[(a + a + 1) b c]  #[a [/[(a + a) c] b] c]
#a                  #a

*[a [b c] d]        [*[a b c] *[a d]]

*[a 0 b]            /[b a]
*[a 1 b]            b
*[a 2 b c]          *[*[a b] *[a c]]
*[a 3 b]            ?*[a b]
*[a 4 b]            +*[a b]
*[a 5 b c]          =[*[a b] *[a c]]

*[a 6 b c d]        *[a *[[c d] 0 *[[2 3] 0 *[a 4 4 b]]]]
*[a 7 b c]          *[*[a b] c]
*[a 8 b c]          *[[*[a b] a] c]
*[a 9 b c]          *[*[a c] 2 [0 1] 0 b]
*[a 10 [b c] d]     #[b *[a c] *[a d]]

*[a 11 [b c] d]     *[[*[a c] *[a d]] 0 3]
*[a 11 b c]         *[a c]

*a                  *a

I thought it would be cute to try and implement a interpreter that works entirely through textual substitution, especially because the C interpreter given in the documentation is not standalone. The evaluation order is somewhat murky in the documentation, and my ability with C leaves a bit to be desired, so it took a little bit longer than I anticipated..

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

char *rule[][2] = {
    {"[a b `]", "[a [b `]]"},

    {"+0", "1"},
    {"+0a", "1a"},

    {"/[1 a]", "a"},
    {"/[2 [a b]]", "a"},
    {"/[3 [a b]]", "b"},
    {"/[0aa b]", "/[2 /[0a b]]"},
    {"/[1aa b]", "/[3 /[0a b]]"},

    {"#[1 [a b]]", "a"},
    {"#[0aa [b c]]", "#[0a [b /[1aa c]] c]"},
    {"#[1aa [b c]]", "#[0a [/[0aa c] b] c]"},

    {"*[a [[b c] d]]",      "[*[a b c] *[a d]]"},
    {"*[a [0 b]]",          "/[b a]"},
    {"*[a [1 b]]",          "b"},
    {"*[a [2 [b c]]]",      "*[*[a b] *[a c]]"},
    {"*[a [3 b]]",          "?*[a b]"},
    {"*[a [4 b]]",          "+*[a b]"},
    {"*[a [5 [b c]]]",      "=[*[a b] *[a c]]"},
    {"*[a [6 [b [c d]]]]",  "*[a *[[c d] 0 *[[2 3] 0 *[a 4 4 b]]]]"},
    {"*[a [7 [b c]]]",      "*[*[a b] c]"},
    {"*[a [8 [b c]]]",      "*[[*[a b] a] c]"},
    {"*[a [9 [b c]]]",      "*[*[a c] 2 [0 1] 0 b]"},
    {"*[a [10 [[b c] d]]]", "#[b *[a c] *[a d]]"},
    {"*[a [11 [[b c] d]]]", "*[[*[a c] *[a d]] 0 3]"},
    {"*[a [11 [b c]]]",     "*[a c]"},

    {"=[a a]", "0"},
    {"=[a b]", "1"},
    {"?[a b]", "0"},
    {"?a", "1"},

    {}
};

static char *B, *b;

char *right(char *x) {
    int d = 0;
    while (*x == ' ') x++;
    while (*x) {
        if ((*x == 0 || *x == ' ' || *x == ']') && d == 0) return x;
        else if (*x == '[') d++;
        else if (*x == ']') d--;
        x++;
    }
    return x;
}

char *subst(char *x, char *y, char *p) {
    char *s[5] = {0}, *e[5] = {0}, *P = p, *r = b, i;

    while (*x && (i = *x++) && *p)
        if (i >= 'a' && i <= 'd') {
            char k = i - '`';
            char *r = right(p);
            if (s[k] && e[k] == p) {
                if ((e[k] - s[k]) % 2) return 0;
                e[k] -= (e[k] - s[k])/2;
                continue;
            }
            if (s[k] && (e[k] - s[k] != r - p || memcmp(p, s[k], r - p) != 0))
                return 0;
            s[k] = p; e[k] = p = r;
        } else if (i == '`') {
            s[0] = p;
            while (*p && *p != ']') p = right(p);
            e[0] = p;
        } else if (i != *p++) return 0;
    if (*x || *p) return 0; 

    while (*y) {
        if (*y >= '`' && *y <= 'd') {
            char k = *y - '`';
            memcpy(b, s[k], e[k] - s[k]);
            b += e[k] - s[k]; y++;
        } else *b++ = *y++;
    }
    *b++ = 0;
    return r;
}

char *treesubst(char *x, char *y, char *p) {
    char *P = p;
    while (*p) {
        if (*p == ' ' || *p == ']') p++;
        else {
            char *s = p, *e = right(p), c = *e, *m;
            *e = 0; m = subst(x, y, s); *e = c;
            if (m) {
                char *r = b;
                b += 1 + sprintf(b, "%.*s%s%s", (int)(s-P), P, m, e);
                return r;
            }
            p = *p == 'N' ? e : p + 1;
        }
    }
    return 0;
}

char *eval(char *x) {
    char *y;
    do {
        memmove(B, x, strlen(x)+1);
        x = B; b = B + strlen(x)+1; y = 0;
        for (int i = 0; !y && *rule[i]; i++)
            if (y = treesubst(rule[i][0], rule[i][1], x))
                x = y;
    } while (y);
    
    return x;
}

char *un(char *p) {
    char *r = b;
    while (*p) {
        if ('0' <= *p && *p <= '9') {
            int x = 0;
            while ('0' <= *p && *p <= '9') x = x*10+*p++-'0';
            while (1+x--) *b++ = 'N';
        } else *b++ = *p++;
    }
    *b++ = 0;
    return r;
}

char *dec(char *p) {
    char *r = b;
    while (*p) {
        char *P = p;
        while (*p == 'N') p++;
        if (p-P) b += sprintf(b,"%ld",p-P-1);
        else *b++ = *p++;
    }
    *b++ = 0;
    return r;
}

int main(int argc, char **argv) {
    // someone told me memory safety is important..
    long ps = sysconf(_SC_PAGESIZE);
    B = b = mmap(NULL, ps + (1l << 32), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (B == MAP_FAILED) { perror("mmap"); return 1; }
    if (mprotect(B + (1l << 32), ps, PROT_NONE) == -1) {
        perror("mprotect"); return 1;
    }

    for (int i = 0; *rule[i]; i++) {
        rule[i][0] = strdup(un(rule[i][0]));
        rule[i][1] = strdup(un(rule[i][1]));
    }

    printf("%s\n", dec(eval(strdup(un(argv[1])))));
    return 0;
}

While this approach is obviously inefficient, it’s not really that much less practical than any other interpreter that doesn’t have appropriate “jets” in urbit parlance, and a bunch of nice things fall out of the textual model for free:

Handling arbitrarily large natural numbers is trivial. Though in practice you are severely limited by memory, but the amount of time needed to construct a number large enough to exhaust memory on a modern computer makes it a moot point anyway. If one did care about such a thing, storing the program in an RLE buffer would solve the problem cleanly.
No need for explicit reference counting or garbage collection.
No need to use recursion (or cheating with a stack) to traverse the AST, the evaluator itself uses a constant amount of memory with the exception of the size of the output string.

With regards to nock itself, it’s unclear to me why this was chosen as an IR format over something more amenable to performant execution, or something more ideologically pure like SK combinator calculus (or iota/jot etc.), since neither are going to be remotely fast without significant idiom recognition and SK combinator calculus is very widely understood and requires minimal specification effort. I must admit it does make me a bit curious about trying to JIT compile it though..

I’m usually not a fan of requiring idiom recognition for performance, but I think I’ve come around a bit now. Originally, I would have told you that it was just “hiding the real specification”, but in this case the nock code itself serves as an exhaustive specification, and changes a specification problem into a distribution issue. That being said, I think not requiring jets to be accompanied by a formal proof of correspondence is a bit weak.