/*  postFORTH - minimal postscript/FORTH bootstrap
    Copyright (C) 2004 John Comeau <jc@jcomeau.com>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

Virtual Memory Map:
00000000 - 08048000 Reserved
08048000 - 08048074 ELF header information
08048074 - 0804A000 (Approximately) core dictionary
next 4 values dependent on bss_size; they're about right for 0x80000
0804A000 - 080816E4 Extensions to dictionary
080816E4 - 080C16E4 CAs calculated from hashes of the Forth words
080C16E4 - 080C96E8 32K buffer, for block I/O and TIB, followed by null byte
080C96E8 - BFFFF604 (Approximately) Reserved (until kernel-assigned stack)
BFFFED04 - BFFFF604 Return Stack
BFFFF604 - BFFFFE04 Parameter Stack
BFFFFE04 - C0000000 Args, environment, other info passed in by Kernel
C0000000 - FFFFFFFF Reserved

Word structure:
Code word LIT starts at 0x8048074:
 03  4C  49  54  03  00  00  00  00  81  80  04  08  AD  50  EB  0E
  3   L   I   T   3    00000000        08048081      LODSW;PUSH AX;JMP NEXT
 Count and word LIT;   Back Link;   Code Address     PFA, runtime LIT code

Design goals:
1. No more than 400 lines of source, to make it extremely easy to port
2. Deferred binding, to make it more like postscript and to allow for
 top-down design of programs, especially the compiler itself.
3. Only the essential words in assembly, meaning that this bootstrap code
 need not contain any traditional Forth words. As it turns out, it has many.
4. No more than 1K compiled size (DOS version).
5. Each routine must fit in one DOS EDIT screen, 80*24.
6. Self-documenting, making this file the only one necessary.
7. Forth words callable in Assembly before coldstart complete.
 This Forth switches to high level using CALL DOCOL+4; >CODE = back to code
8. Optimized for porting ease first, runtime speed second, programming ease
 third, conformity with standards a distant fourth.

Revision History:
 8/19/97 could not get it to recognize CRLF as EOL when reading from a file,
 even after setting cooked mode. So I made the buffer size 4K instead of 1K.
 9/14/97 Changed to 32K buffer, made buffersize variable for easier changes.
 2000-03-11 now builds dict the right way by pushing all the NFAs on the
  stack first, then starting at 'LIT'.
 Count words using the following:
 grep '^\(create\|constant\|highlevel\|variable\)' f.s | wc
 2000-09-29 counted 45, 2 more than I thought!
debug=-1 /* change to zero, or comment out, when debugged.
*/
cellsize=4 /* 2 for DOS, 4 for linux */
const=(cellsize*2)+1 /* 5 for DOS, 9 for linux */

    .section .text, "awx", @progbits /* the 'w' seems to be ignored by ld... */
    .global _start
    .equ _start, cold+cellsize /* get rid of legacy 'jmp' from FIG-forth */
/* All words in this Forth are immediate, and it's stateless,
   so there's no need for any flag bits or "smudge".
*/
hashbits=15 /* determines size of dict, (cellsize**hashbits)*2 */
dictsize=cellsize << hashbits /* same as (cellsize**hashbits)*2 */
bitmask=dictsize-1 /* logical-AND with a number to make it fit dictsize */
stacksize=0x800 /* size for parameter and return stacks */
buffersize=0x8000 /* size of screen/disk block */
bss_size=DATASIZE * 0x10000 + stacksize /* size of data section */
/* if using Linux stack, leave 0xff00 at end of input buffer for PARSE */
buffer=bss_size-buffersize-cellsize-stacksize /* add to _edata */
9: /* just so macro doesn't fail on LIT */
.macro create label, word
    backlink_\label=9b
    backlink_lit=0 /* first word has 0 for backlink */
9:  .byte 1f-0f
0:  .ascii "\word"
1:  .byte .-0b
    .long backlink_\label
\label: .long .+cellsize /* primary CA */
    .endm
/* define how code words will be defined */
.macro variable label, word, value /* variable definition */
    create \label,\word
    call nextword+cellsize /* leaves address of variable on the stack */
    .long \value
    .endm
.macro constant label, word, value /* constant definition */
    create \label, \word
    call fetch+cellsize /* same code as DOCON, so why waste another word? */
    .long \value
    .endm
.macro highlevel label, word /* colon definition */
    create \label, \word
    call docol+cellsize
    .endm
.macro next
    jmp nextword+cellsize
    .endm

create lit, "LIT"
    lodsl /* get (IP)+ */
    push %eax
    next
create nextword, "NEXT" /* also serves as DOVAR if CALLed */
    lodsl /* get CA and advance the pointer */
    jmp *(%eax) /* jump to the code procedure that begins every Forth word */
create docol, "DOCOL"
    sub $cellsize,%ebp /* make room on return stack */
    mov %esi,(%ebp) /* save return address */
    pop %esi /* get CA of defined word, most often another DOCOL (!) */
    next /* here we go again */
create unnest, "\x3bS" /* high-level return (compiles to ;S aka semis) */
    mov (%ebp),%esi /* restore address of caller */
    add $cellsize,%ebp /* advance pointer */
    next
create execute, "EXECUTE"
    pop %ebx /* get address of operation to perform */
    jmp *(%ebx) /* go do it */
create tocode, ">CODE"
/* ( addr -) */
/* get address of machine code inline with this highlevel word */
    mov %esi,%ebx
    mov (%ebp),%esi /* restore address of caller, like ;S */
    add $cellsize,%ebp /* adjust return stack */
    jmp *%ebx /* jump directly into code routine */
create store, "!"
/* ( n addr -) /* store n at addr */
    pop %ebx
    popl (%ebx) /* get addr into BX, store n at addr */
    next
create input, "INPUT" /* similar to Forth EXPECT */
    mov (bbuf+const),%edx /* load count and addr from variables */
    mov (buf+const),%ecx
    mov $0x3,%eax /* sys_READ */
    xor %ebx,%ebx /* 0 = Standard Input, STDIN */
    int $0x80 /* syscall */
    cmpl $0xfffff001, %eax
    jae 1f /* handle error */
    or %eax,%eax /* see if we got anything */
    jz 1f /* exit if not */
    jmp 2f /* skip to next Forth instruction */
1:  mov %eax,%ebx /* exit code */
    mov $1,%eax /* _exit syscall */
    int $0x80 /* exit */
2:  next
create count, "COUNT"
/* (cstring - addr count) */
    pop %ebx /* get address of counted string */
    xor %eax,%eax /* clear count */
    mov (%ebx),%al /* get count byte */
    inc %ebx /* point past it */
    push %ebx /* store address */
    push %eax
    next /* store count */
highlevel index, "INDEX"
/* (cstring - index linkaddr) (index into hashtable) */
    .long duplicate,count,hash,duplicate,plus,duplicate,plus
    .long lit,bitmask,mask,dict,fetch,plus,swap,count,plus,oneplus,unnest
/* hash function uses Andy Lowry's algorithm from Kermit book p.257 */
create hash, "HASH" /* (addr count - hash) (hash a counted string) */
    pop %ecx /* get count */
    pop %ebx /* get address of counted string */
    xchg %ebx,%esi /* move it where we can use it best */
    xor %edi,%edi /* clear hash accumulator */
    or %ecx,%ecx /* can't safely use 16-bit instructions jcxz */
    jz 2f /* skip hard part if nothing there */
    push %ebp /* save return stack pointer */
    mov $010201,%ebp /* use it for multiplier */
1:  xor %eax,%eax /* clear MSBs */
    lodsb /* get next char */
    push %eax /* save it */
    xor %edi,%eax /* crc ^ c */
    and $017,%eax /* q = (crc ^ c) & 017; */
    mul %ebp /* q * 010201 */
    shr $4,%edi /* crc >> 4 */
    xor %eax,%edi /* crc = (crc >> 4) ^ (q * 010201); */
    pop %eax /* get c back */
    shr $4,%eax /* high nybble this time */
    xor %edi,%eax /* crc ^ (crc >> 4) */
    and $017,%eax /* q = (crc ^ (c >> 4)) & 017; */
    mul %ebp /* q * 010201 */
    shr $4,%edi /* crc >> 4 */
    xor %eax,%edi /* crc = (crc >> 4) ^ (q * 010201); */
    loop 1b
    pop %ebp /* restore return stack pointer */
2:  push %edi
    xchg %ebx,%esi
    jmp nextword+cellsize
constant buf, "IN", _edata+buffer /* pointer to input buffer */
variable bufptr, ">IN", _edata+buffer /* offset into buffer, FIG=IN */
variable dp, "DP", eod /* current end of dictionary, HERE */
variable latest, "LATEST", enddict /* NFA of last word defined */
variable sp0, "SP0", 0 /* beginning parameter stack pointer */
variable rp0, "RP0", 0 /* return stack pointer */
constant bbuf, "B/BUF", buffersize /* bytes per block and per screen (same) */
constant cell, "CELL", cellsize /* bytes per word (cell) */
constant blank, "S", 32  /* blank, ASCII space character */
constant z, "0", 0 /* 0=stdin */
constant one, "1", 1 /* 1=stdout */
create oneplus, "1+"
    pop %eax /* get arg to increment */
    inc %eax /* do so, then back onto stack */
    push %eax
    next
create fill, "FILL" /* do NOT make high level, needed by COLD */
/* of course that really doesn't matter as long as primary CAs are used */
/* (addr n b -) */
    pop %eax /* fill character */
    pop %ecx /* count */
    pop %edi /* start address */
    rep
    stosb /* fill memory block with fill character */
    next
create parse, "PARSE" /* similar to Forth WORD but doesn't need delimiter */
/* (- addr) /* returns address where counted word was stored */
    mov (bufptr+const),%edi /* pointer into buffer space, Forth IN or >IN */
    mov (blank+const),%al /* <= space (NUL, TAB, CR, etc) are delimiters */
1:  scasb /* go until nondelimiter reached */
    jnb 1b /* loop while blank; depends on nonblank before DI wraps around */
    dec %edi /* point back to first nondelimiter */
    cmp $_edata+buffer+buffersize,%edi /* did we go past buffer? */
    jb 2f /* skip if not */
    mov $_edata+buffer+buffersize,%edi /* point to null just past buffer */
2:  mov %edi,%ebx /* address of string */
3:  scasb /* compare to blank */
    jb 3b /* loop if nonblank */
    dec %edi /* adjust for overshoot */
    mov %edi,(bufptr+const) /* update >IN */
    sub %ebx,%edi /* calculate byte count... */
    mov %edi,%eax /* into AX */
    mov (dp+const),%edi /* get HERE location */
    stosb /* store count byte */
    xchg %esi,%ebx /* save Forth IP while we use SI to store the string */
    mov %eax,%ecx /* copy count where it counts */
    rep
    movsb /* store the string */
    xchg %esi,%ebx /* restore IP before we get careless and mess it up */
    pushl (dp+const) /* pointer to counted word */
    next
variable dict, "DICT", _edata+buffer-dictsize /* buffer for hashes */
create cold, "COLD"
    mov (sp0+const),%eax /* check if SP0 already initialized */
    or %eax,%eax /* check for 0 */
    jnz 1f /* skip if already initialized */
    mov %esp,(sp0+const) /* save initial SP */
    mov $rstack,%ebp /* end of .bss into RS pointer */
    mov %ebp,(rp0+const) /* save initial return stack pointer */
1:  mov (sp0+const),%esp /* set our own stack pointer */
    mov (rp0+const),%ebp /* set return stack pointer */
    movl $0xff00,(__bss_start+bss_size-cellsize) /* init end of input buffer */
    call docol+cellsize /* high level for the next part */
    .long dict,fetch,lit,dictsize,z,fill /* init hash table */
    .long z,latest,fetch /* push 0 as end marker, start at end of dictionary */
2:  .long duplicate,zbranch,3f-.,duplicate,index,swap,drop,fetch,branch,2b-.
3:  .long drop /* get rid of null from LIT's backlink */
4:  .long duplicate,zbranch,5f-. /* done if reached null */
    .long index,lit,2*cellsize,plus,swap,store /* point to code in hashtable */
    .long branch,4b-. /* loop until null reached */
5:  .long drop,quit /* drop the null, launch into query-interpret loop */
/* downside to this approach: leaves COLD on return stack */
/* upside: now you can redirect QUIT to make a turnkey system */
create comma, ","
/* (n - ) ;# compile n at next available location and advance the pointer */
    mov (dp+const),%ebx /* next available location into BX */
    pop (%ebx) /* store n at HERE */
    add $cellsize,%ebx /* advance pointer */
    mov %ebx,(dp+const) /* store pointer where it belongs */
    next
create ccomma, "C,"
/* (b - ) ;# compile byte at next available location and advance pointer */
    pop %eax /* get byte to compile */
    movl (dp+const),%ebx /* get HERE */
    mov %al,(%ebx) /* store the byte */
    incl (dp+const) /* update pointer */
    next
highlevel hexnumber, "16\x23" /* 16# input hex number (as in postscript) */
/* ( - n) ;#only hexadecimal, 0-9, uppercase A-F */
    .long parse,tocode
    pop %ebx /* pointer to number */
    mov (%ebx),%cl /* get count byte */
    xor %eax,%eax  /* start off with zero */
1:  inc %ebx  /* point to next digit */
    mov (%ebx),%dl /* grab it */
    test $0x40,%dl /* see if letter (assuming only 0-9, A-F) */
    jz 2f /* skip if not */
    sub $7,%dl /* make "A" = "9" + 1 */
2:  sub $'0,%dl /* bring it to the correct value */
    shl $4,%eax /* will only work on 386 or better */
    or  %dl,%al /* merge in this digit */
    loop 1b /* until done */
    push %eax
    next /* place converted number on stack */
highlevel tick, "\x27" /* hmph, older 'as' accepted "'" */
/* ( - n) ;#take counted string and return number on stack */
    .long parse /* count the word and store at end of dictionary */
    .long index,drop,unnest /* hash the word and drop the linkword */
highlevel quit, "QUIT"
1:  .long buf,bbuf,blank,fill /* clear buffer */
    .long buf,bufptr,store /* zero buffer pointer */
    .long input,interpret,branch,1b-. /* get and process commands; loop forever */
create swap, "SWAP"
/* ( n1 n2 - n2 n1) ;#swap top 2 stack items */
    pop %eax
    pop %ebx
    push %eax
    push %ebx
    next
create plus, "+"
/* ( n1 n2 - n3) ;#returns n3=n1+n2 */
    pop %ebx
    pop %eax
    add %ebx,%eax
    push %eax
    next
create minus, "-"
/* ( n1 n2 - n3) ;#returns n3=n1-n2 */
    pop %ebx
    pop %eax
    sub %ebx,%eax
    push %eax
    next
create mask, "&"
/* (n1 n2 - n3) ;#returns n3=n1&n2 */
    pop %eax
    pop %ebx
    and %ebx,%eax
    push %eax
    next
highlevel interpret, "INTERPRET" /* stateless, loops forever */
1:  .long parse,index,drop,execute,branch,1b-. /* until null or ;S */
create null, "\0"
    jmp unnest+cellsize
create duplicate, "DUP"
/* ( n - n n) ;#duplicate top stack item */
    pop %eax
    push %eax
    push %eax
    next
create drop, "DROP"
/* ( n -) ;#eliminate top stack element */
    pop %eax
    next
create branch, "BRANCH"
    add (%esi),%esi /* update Forth IP */
    next
create zbranch, "0BRANCH"
    pop %eax /* see if zero on top of stack */
    or %eax,%eax
    jz branch+cellsize /* if so, join common code with unconditional BRANCH */
    lodsl /* else skip the branch offset */
    next
create fetch, "@"
/* ( addr - n) */
    pop %ebx
    pushl (%ebx)
    next
create cfetch, "C@"
/* ( addr - b) */
    pop %ebx
    xor %eax,%eax /* clear MSBs */
    mov (%ebx),%al /* get byte at pointer */
    push %eax
    next
highlevel here, "HERE"
	.long dp, fetch, unnest /* next compiler location */
highlevel bcreate, "[CREATE]" /* create full header ( - oldCA) */
    .long here /* first get current dictionary pointer for LATEST ! */
    .long parse,count,plus,dp,store /* counted string, first part of header */
    .long duplicate,count,ccomma,drop /* store final count byte */
    .long latest,fetch,comma /* back pointer to previous word */
    .long latest,store /* update latest to point to this word */
    .long latest,fetch,index,drop,fetch /* previous CA if any, leave on stack */
    .long here,lit,cellsize,plus,duplicate,comma /* first CA */
    .long latest,fetch,index,drop,store,unnest /* 2nd CA */
eod=.
enddict=9b /* last link in dictionary */
    .bss
    .skip bss_size,1
rstack: