;
; tokenise.s
;
; Tokenise a Termite script
;
; © 1995 Straylight
;

;----- Standard Header ------------------------------------------------------

		GET	libs:header
		GET	libs:swis

		GET	libs:stream

;----- External dependencies ------------------------------------------------

		GET	sh.anchor
		GET	sh.tokens
		GET	sh.var

;----- Main code ------------------------------------------------------------

		AREA	|TermScript$$Code|,CODE,READONLY

; --- tokenise ---
;
; On entry:	R0 == pointer to source buffer
;		R1 == size of source text
;		R2 == pointer to destination buffer
;		R3 == flags:
;			bit 0 == start reading a statement
;		R11 == pointer to Termite upcall block
;		R12 == pointer to script anchor
;
; On exit:	May return an error
;
; Use:		Tokenises a Termite script into an output buffer.
;
; Notes:	The tokeniser is a state machine.  The main loop feeds one
;		character at a time to the handler selected by R6; a handler
;		may `backtrack' (decrement R7) to have a character seen again
;		by whichever state it has just selected.
;
;		Register allocation shared with the state handlers:
;
;		R0 == current character, or -1 for end of input
;		R1, R2 == scratch (R2 is the pending token in keyWord state)
;		R3 == tFlag__ flags
;		R4 == backtrack count (keyWord state)
;		R5 == token table offset, in its top 16 bits (keyWord state)
;		R6 == current state (a tState__ value)
;		R7 == pointer to next source byte
;		R8 == pointer to source buffer limit
;		R9 == write pointer within scratch buffer sail_misc
;		R10 == output buffer write pointer
;		R14 == return address and flags
;
;		Two words of local storage sit on the stack below the saved
;		registers:
;
;		[R13,#0] == current line number (0 == not counting lines)
;		[R13,#4] == state to resume once a block comment ends
;
;		Handlers return with MOVS PC,R14, which puts back the flags
;		captured along with the return address.

		EXPORT	tokenise
tokenise	ROUT

		STMFD	R13!,{R0-R10,R14}	;Save some registers
		MOV	R10,R2			;Point to output buffer
		MOV	R7,R0			;Point to input buffer
		ADD	R8,R7,R1		;Point to input buffer limit
		ADR	R9,sail_misc		;Point to temporary buffer
		ANDS	R14,R3,#1		;Awaiting a statement?
		MOVNE	R6,#tState__stmt	;Yes -- select that state
		MOVEQ	R6,#tState__dunno	;No -- normal scanning state
		MOVNE	R14,#1			;Yes -- we start on line 1
		STR	R14,[R13,#-8]!		;Save line number (0 if none)

		; --- Ok, start lexing ---
		;
		; When R7 reaches the limit exactly, the handlers are given
		; one EOF character (-1) so they can finish off whatever they
		; were reading.  The byte at the limit is not part of the
		; source, so it is never loaded.  A handler may backtrack
		; over the EOF, in which case it is delivered again.

		MOV	R3,#0			;No flags set yet
00tokenise	CMP	R7,R8			;Where are we in the source?
		BHI	%10tokenise		;Past the limit -- stop then
		LDRNEB	R0,[R7],#1		;Before it -- load next byte
		MOVEQ	R0,#-1			;At it -- get an EOF character
		ADDEQ	R7,R7,#1		;...and step past the limit

		; --- Dispatch on the current state ---
		;
		; R14 is set to the address of the `B %00tokenise' below,
		; and PC reads as the address of the first table entry, so
		; the table must follow the ADD immediately and stay in
		; tState__ order.

		MOV	R14,PC			;Set up the return address
		ADD	PC,PC,R6,LSL #2		;And leap off into oblivion
		B	%00tokenise		;Handlers come back to here

		B	tok__stmt
		B	tok__dunno
		B	tok__string
		B	tok__dblQte
		B	tok__star
		B	tok__decimal
		B	tok__hex
		B	tok__ident
		B	tok__keyWord
		B	tok__label
		B	tok__expLab
		B	tok__lineCmt
		B	tok__blkCmt
		B	tok__cmtStar

		; --- Tidy up if we've finished ---

10tokenise	MOV	R14,#255		;Mark the end of the script
		STRB	R14,[R10],#1		;Save that on the end
		ADD	R13,R13,#8		;Drop the two local words
		LDMFD	R13!,{R0-R10,R14}	;Restore caller's registers
		BICS	PC,R14,#V_flag		;And return with V clear

		LTORG

; --- tok__error ---
;
; On entry:	R0 == pointer to error block
;		R13 == pointer to tokenise's two local words (nothing else
;		       may be stacked above tokenise's frame)
;
; On exit:	Does not return to its caller; returns to the caller of
;		tokenise with V set, R0 as on entry and R1-R10 restored
;
; Use:		Reports an error during tokenising.

tok__error	ROUT

		ADD	R13,R13,#12		;Skip locals and stacked R0
		LDMFD	R13!,{R1-R10,R14}	;Restore registers too
		ORRS	PC,R14,#V_flag		;And return the error

		LTORG

; --- tok__incLineNo ---
;
; On entry:	R13 == pointer to tokenise's two local words
;
; On exit:	R1 corrupted; flags as they were on entry
;
; Use:		Increments the current line number.  A line number of zero
;		is left alone.

tok__incLineNo	ROUT

		LDR	R1,[R13,#0]		;Load the line number
		CMP	R1,#0			;Are we scanning a file?
		ADDNE	R1,R1,#1		;Yes -- increment it
		STRNE	R1,[R13,#0]		;And store it back
		MOVS	PC,R14			;Return, restoring the flags

		LTORG

; --- tok__stmt ---
;
; On entry:	R0 == current character
;
; Use:		Handles the first significant character of a statement.  A
;		`*' starts a *command and a `.' starts a label definition;
;		anything else is treated as in the dunno state.

tok__stmt	ROUT

		CMP	R0,#'*'			;Is this a *command?
		STREQB	R0,[R10],#1		;Yes -- store the * character
		MOVEQ	R6,#tState__star	;And read a star command
		MOVEQS	PC,R14			;And return to caller

		CMP	R0,#'.'			;Is this a label
		ORREQ	R3,R3,#tFlag__readDot	;We've just read a dot
		MOVEQ	R6,#tState__label	;Read a label, please
		MOVEQ	R4,#0			;No character read yet
		MOVEQS	PC,R14			;And return to caller

		; Drop through to dunno

; --- tok__dunno ---
;
; On entry:	R0 == current character
;
; On exit:	R1, R2 corrupted
;
; Use:		Works out what sort of item starts with this character and
;		selects a state to read it.  Also entered by falling through
;		from tok__stmt, in which case R6 is still tState__stmt.

tok__dunno	ROUT

		; --- Ignore whitespace ---
		;
		; Other states will have dealt with this as necessary already
		; so we don't need to bother.  A newline is stored and starts
		; a new statement; spaces and tabs leave the state alone.

		MOV	R2,R14			;Preserve the link
		CMP	R0,#10			;Is it a newline
		STREQB	R0,[R10],#1		;Yes -- store it in buffer
		BLEQ	tok__incLineNo		;Increment the line number
		MOVEQ	R6,#tState__stmt	;And start a new statement
		CMPNE	R0,#' '			;Is it a space
		CMPNE	R0,#9			;Or a tab?
		MOVEQS	PC,R2			;Yes -- don't do anything
		MOV	R14,R2			;Put the link back

		; --- Now find an appropriate state ---
		;
		; A slash may turn out to open a block comment, so the
		; state in force now is saved for tok__cmtStar to put back.
		; Operator characters and capital letters are looked up in
		; the token table; R1 ends up below 26 for any of them.

		CMP	R0,#'/'			;Is this a slash?
		STREQ	R6,[R13,#4]		;Yes -- save old state away

		SUBS	R1,R0,#'+'		;Check for strange chars
		SUBNES	R1,R0,#'-'		;Check for strange chars
		SUBNES	R1,R0,#'*'		;Check for strange chars
		SUBNES	R1,R0,#'/'		;Check for strange chars
		SUBNES	R1,R0,#'<'		;Check for strange chars
		SUBNES	R1,R0,#'>'		;Check for strange chars
		SUBNE	R1,R0,#'A'		;Otherwise check uppercase
		CMP	R1,#26			;Is it in the right range?
		SUBCC	R7,R7,#1		;Yes -- backtrack one char
		MOVCC	R6,#tState__keyWord	;And read a keyword
		MOVCC	R5,#0			;Start at top of token table
		MOVCC	R4,#0			;No characters read yet
		MOVCC	R2,#0			;No token discovered yet
		MOVCCS	PC,R14			;And return to caller

		MOV	R6,#tState__dunno	;Read char, so not new stmt

		CMP	R0,#'"'			;Is it a string?
		STREQB	R0,[R10],#1		;Yes -- store opening quote
		MOVEQ	R6,#tState__string	;And read a string literal
		MOVEQS	PC,R14			;And return to caller

		SUB	R1,R0,#'0'		;Check if it's a digit
		CMP	R1,#10			;Is it in the right range?
		STRCCB	R0,[R10],#1		;Yes -- store digit
		MOVCC	R6,#tState__decimal	;And read a decimal number
		MOVCCS	PC,R14			;And return to caller

		CMP	R0,#'&'			;Also check for hex numbers
		STREQB	R0,[R10],#1		;Yes -- store the ampersand
		MOVEQ	R6,#tState__hex		;And read a hex number
		MOVEQS	PC,R14			;And return to caller

		SUBS	R1,R0,#'_'		;Is this an underscore?
		SUBNE	R1,R0,#'a'		;Check if it's a lowercase
		CMP	R1,#26			;Is it in the right range?
		STRCCB	R0,[R10],#1		;Yes -- store the character
		MOVCC	R6,#tState__ident	;And read an identifier
		MOVCCS	PC,R14			;And return to caller

		CMP	R0,#':'			;Is this a colon?
		STREQB	R0,[R10],#1		;Yes -- store the character
		MOVEQ	R6,#tState__stmt	;And start a new statement
		MOVEQS	PC,R14			;And return to caller

		STRB	R0,[R10],#1		;Anything else goes straight out
		MOVS	PC,R14			;And return to caller

		LTORG

; --- tok__star ---
;
; On entry:	R0 == current character
;
; On exit:	R1, R2 corrupted
;
; Use:		Copies the text of a *command to the output untouched.  A
;		newline ends the command and starts a new statement.

tok__star	ROUT

		MOV	R2,R14			;Preserve the link
		STRB	R0,[R10],#1		;Save the character
		CMP	R0,#10			;Is this a newline?
		MOVEQ	R6,#tState__stmt	;Start a new statement
		BLEQ	tok__incLineNo		;Increment the line number
		MOVS	PC,R2			;And return to caller

		LTORG

; --- tok__string ---
;
; On entry:	R0 == current character
;
; On exit:	R1, R2 corrupted
;
; Use:		Copies the body of a string literal to the output.  A quote
;		moves to the dblQte state to see whether it is doubled; a
;		newline abandons the string and starts a new statement.

tok__string	ROUT

		STRB	R0,[R10],#1		;Save the character
		CMP	R0,#'"'			;Is it another quote?
		MOVEQ	R6,#tState__dblQte	;Yes -- change state
		MOVEQS	PC,R14			;And return

		MOV	R2,R14			;Preserve the link
		CMP	R0,#10			;Is it newline?
		MOVEQ	R6,#tState__stmt	;Yes -- change state
		BLEQ	tok__incLineNo		;...increment line number
		MOVS	PC,R2			;And return

		LTORG

; --- tok__dblQte ---
;
; On entry:	R0 == character following a quote inside a string
;
; Use:		A second quote is stored and the string carries on; anything
;		else means the string has ended, so the character is handed
;		back for the dunno state to look at.

tok__dblQte	ROUT

		CMP	R0,#'"'			;Is this a 2nd quote?
		MOVEQ	R6,#tState__string	;Yes -- go back to string
		STREQB	R0,[R10],#1		;..and store it away
		SUBNE	R7,R7,#1		;Otherwise backtrack
		MOVNE	R6,#tState__dunno	;..and enter dunno state
		MOVS	PC,R14

		LTORG

; --- tok__decimal ---
;
; On entry:	R0 == current character
;		R7 == pointer to next source byte
;		R8 == pointer to source buffer limit
;
; On exit:	R1 corrupted
;
; Use:		Copies decimal digits to the output.  The first character
;		which isn't a digit ends the number and is handed back to
;		the dunno state.  tok__numHack is shared with tok__hex.

tok__decimal	ROUT

		SUB	R1,R0,#'0'		;Set up for range check
		CMP	R1,#10			;Are we in range?
		STRCCB	R0,[R10],#1		;Yes -- store the number
		MOVCCS	PC,R14			;And return

		; --- A bit of bodgery now ---
		;
		; This hackery introduces a space between two numbers, which
		; would otherwise severely upset something like
		;
		; DIM a%!24 64
		;
		; The byte after the space is only peeked at if it really is
		; part of the source; a space which ends the source is just
		; handed back like any other terminator.

tok__numHack	CMP	R0,#&20			;Is this a space?
		BNE	%f00			;No -- just stop normally
		CMP	R7,R8			;Is there a byte after it?
		BCS	%f00			;No -- don't peek past limit
		LDRB	R1,[R7,#0]		;Get the next byte
		SUB	R1,R1,#'0'		;Is it a digit?
		CMP	R1,#10			;Quick check
		STRCCB	R0,[R10],#1		;Yes -- store the space

00		SUB	R7,R7,#1		;Backtrack a little
		MOV	R6,#tState__dunno	;...and change state
		MOVS	PC,R14			;Return to caller

		LTORG

; --- tok__hex ---
;
; On entry:	R0 == current character
;
; On exit:	R0, R1 corrupted
;
; Use:		Copies hex digits to the output, forcing `a'-`f' to upper
;		case.  Anything else ends the number via tok__numHack.

tok__hex	ROUT

		SUB	R1,R0,#'a'		;Set up for range check
		CMP	R1,#6			;Are we in range?
		SUBCC	R0,R0,#'a'-'A'		;Force to uppercase
		SUBCS	R1,R0,#'0'		;Otherwise try a digit
		CMPCS	R1,#10
		SUBCS	R1,R0,#'A'		;Or an uppercase hex letter
		CMPCS	R1,#6
		STRCCB	R0,[R10],#1		;Yes -- store the number
		MOVCCS	PC,R14			;And return

		; --- Hack as above ---

		B	tok__numHack		;Use hacking code above

		LTORG

; --- tok__ident ---
;
; On entry:	R0 == current character
;
; On exit:	R1 corrupted
;
; Use:		Copies the characters of an identifier to the output.  A `$'
;		or `%' is stored and ends the identifier.  Any other
;		character which isn't alphanumeric or an underscore ends it
;		too: a space is written in its place and the character is
;		handed back to the dunno state.

tok__ident	ROUT

		CMP	R0,#'$'			;Is it a dollar sign?
		CMPNE	R0,#'%'			;Or a percentage?
		STREQB	R0,[R10],#1		;Yes -- store it then
		MOVEQ	R6,#tState__dunno	;Change state
		MOVEQS	PC,R14			;And return to caller

		SUBS	R1,R0,#'_'		;Is it an underscore?
		SUBNE	R1,R0,#'0'		;Or a number?
		CMP	R1,#10
		SUBCS	R1,R0,#'A'		;Or a capital letter?
		CMPCS	R1,#26
		SUBCS	R1,R0,#'a'		;Or a lowercase letter?
		CMPCS	R1,#26
		MOVCS	R1,#' '			;If not valid, append space
		MOVCC	R1,R0			;Otherwise write character
		STRB	R1,[R10],#1		;Store a character
		SUBCS	R7,R7,#1		;No -- backtrack a little
		MOVCS	R6,#tState__dunno	;...and change state
		MOVS	PC,R14			;Return to caller

; --- tok__label ---
;
; On entry:	R0 == current character
;		R3 == flags (tFlag__readDot, tFlag__readDEF, tFlag__readFN)
;		R9 == write pointer within scratch buffer
;
; On exit:	R0-R2 corrupted; may leave through tok__error
;
; Use:		Reads the name of a label, procedure or function.  If the
;		name is being defined (tFlag__readDot or tFlag__readDEF is
;		set) its characters are collected in the scratch buffer
;		and, once the name ends, handed to var_create; the name is
;		not written to the output.  Otherwise the characters are
;		copied straight to the output.  No check is made here that
;		the name fits in the scratch buffer.

tok__label	ROUT

		SUBS	R1,R0,#'_'		;Is it an underscore?
		SUBNE	R1,R0,#'0'		;Or a number?
		CMP	R1,#10
		SUBCS	R1,R0,#'A'		;Or a capital letter?
		CMPCS	R1,#26
		SUBCS	R1,R0,#'a'		;Or a lowercase letter?
		CMPCS	R1,#26
		BCS	%05tok__label		;No -- do other things then

		TST	R3,#tFlag__readDot + tFlag__readDEF
		STREQB	R0,[R10],#1		;Not defining -- to output
		STRNEB	R0,[R9],#1		;Otherwise store in scratch
		MOVS	PC,R14			;...and return

		; --- Are we defining this label? ---

05tok__label	TST	R3,#tFlag__readDot + tFlag__readDEF
		SUBEQ	R7,R7,#1		;No -- backtrack a little
		MOVEQ	R6,#tState__dunno	;...change state
		MOVEQS	PC,R14			;...and return

		; --- Create the variable then ---
		;
		; Two more words go on the stack here, so the line number
		; is found at [R13,#8] until they come off again.  They
		; must be unstacked before going to tok__error.

		STMFD	R13!,{R3,R14}		;Preserve R3 and link
		MOV	R14,#0			;Terminate scratch buffer
		STRB	R14,[R9],#1		;To make things nice
		TST	R3,#tFlag__readDot	;Have we just read a dot?
		MOVNE	R0,#vType_label		;Yes -- create a label
		BNE	%10tok__label		;...and jump ahead
		TST	R3,#tFlag__readFN	;Is this a DEFFN?
		MOVNE	R0,#vType_fn		;Yes -- define one of these
		MOVEQ	R0,#vType_proc		;No -- define a DEFPROC then

10tok__label	ADR	R9,sail_misc		;Point to scratch start
		MOV	R1,R9			;Point to label name
		MOV	R2,R10			;Get the file address
		LDR	R3,[R13,#8]		;Load the line number
		CMP	R3,#0			;Are we scanning the file?
		BLNE	var_create		;Yes -- create the variable
		LDMVSFD	R13!,{R3,R14}		;If it failed, unstack...
		BVS	tok__error		;...and die horridly

		SUB	R7,R7,#1		;Backtrack a little
		MOV	R6,#tState__dunno	;...change state
		LDMFD	R13!,{R3,R14}		;Restore flags word
		BIC	R3,R3,#tFlag__readDot + tFlag__readDEF
		MOVS	PC,R14			;Return to caller

		LTORG

; --- tok__expLab ---
;
; On entry:	R0 == current character
;
; Use:		Skips spaces and tabs before a label name.  Anything else
;		is handed back for the label state to read.

tok__expLab	ROUT

		CMP	R0,#' '			;Is it a space?
		CMPNE	R0,#9			;Or a TAB char?
		SUBNE	R7,R7,#1		;No -- backtrack a little
		MOVNE	R6,#tState__label	;...we are reading a label
		MOVS	PC,R14			;Return

		LTORG

; --- tok__lineCmt ---
;
; On entry:	R0 == current character
;
; On exit:	R1, R2 corrupted
;
; Use:		Throws away the text of a line comment.  The newline which
;		ends it is stored and starts a new statement.

tok__lineCmt	ROUT

		MOV	R2,R14			;Preserve the link
		CMP	R0,#10			;Is this a newline?
		STREQB	R0,[R10],#1		;Save the newline character
		MOVEQ	R6,#tState__stmt	;Start a new statement
		BLEQ	tok__incLineNo		;Increment the line number
		MOVS	PC,R2			;And return to caller

		LTORG

; --- tok__blkCmt ---
;
; On entry:	R0 == current character
;
; On exit:	R0-R2 corrupted
;
; Use:		Throws away the text of a block comment.  Each newline
;		inside it is counted and written to the output as
;		character 31; a star might be the start of the closing
;		`*/', so it selects the cmtStar state.

tok__blkCmt	ROUT

		MOV	R2,R14			;Preserve the link
		CMP	R0,#10			;Is this a newline?
		MOVEQ	R0,#31			;Yes -- insert a weird char
		STREQB	R0,[R10],#1		;Put it in the buffer
		BLEQ	tok__incLineNo		;Increment the line number
		MOVEQS	PC,R2			;And return

		CMP	R0,#'*'			;Is it a star?
		MOVEQ	R6,#tState__cmtStar	;Yes -- change mode then
		MOVS	PC,R2			;Return to caller

		LTORG

; --- tok__cmtStar ---
;
; On entry:	R0 == character following a star in a block comment
;
; On exit:	R1, R2 corrupted
;
; Use:		A slash ends the comment and brings back the state saved
;		at [R13,#4]; another star keeps us here; anything else goes
;		back to the blkCmt state.  Newlines are counted and written
;		as character 31, as in tok__blkCmt.

tok__cmtStar	ROUT

		MOV	R2,R14			;Preserve the link
		CMP	R0,#10			;Is this a newline?
		MOVEQ	R1,#31			;Yes -- insert a weird char
		STREQB	R1,[R10],#1		;Put it in the buffer
		BLEQ	tok__incLineNo		;Increment the line number

		CMP	R0,#'/'			;Is the comment over now?
		LDREQ	R6,[R13,#4]		;Yes -- load previous state
		CMPNE	R0,#'*'			;Is it still a star?
		MOVNE	R6,#tState__blkCmt	;No -- change state back
		MOVS	PC,R2			;And return to caller

		LTORG

; --- tok__keyWord ---
;
; On entry:	R0 == current character
;		R2 == token matched so far, or 0 if none yet
;		R4 == number of characters read since that match
;		R5 == offset of current table list, in its top 16 bits
;
; On exit:	R0, R1 corrupted; R2, R4, R5 updated
;
; Use:		Matches a keyword or operator, one character per call.
;
;		As read by this code, the token table is made of lists of
;		words, each list ending with a zero word.  In each word:
;
;		bits 24-31 == character to match
;		bits 16-23 == token completed by this character (0 if none)
;		bits 0-15  == offset from tokTable of the list to use for
;			      the next character (0 if there isn't one)
;
;		When matching can go no further, the characters read since
;		the last complete token are handed back.  If no token was
;		completed at all, the whole lot is re-read as an identifier.
;		A dot abbreviates a keyword: the first entry of each list is
;		followed until one with no continuation is found.

tok__keyWord	ROUT

		STMFD	R13!,{R14}		;Save link; R14 is scratch
		ADR	R1,tokTable		;Point to the token table
		ADD	R1,R1,R5,LSR #16	;Point into the table
		CMP	R0,#'.'			;Is this a dot?
		BEQ	%18tok__keyWord		;Yes -- jump ahead then
		ADD	R4,R4,#1		;Increment char count

10tok__keyWord	LDR	R14,[R1],#4		;Load the next table entry
		CMP	R14,#0			;Is this the end?
		BEQ	%15tok__keyWord		;Yes -- jump ahead
		CMP	R0,R14,LSR #24		;Is this a match?
		BNE	%10tok__keyWord		;No -- keep looking

		BIC	R14,R14,#&FF000000	;Clear char to match byte
		MOVS	R0,R14,LSR #16		;Get the token byte
		MOVNE	R2,R0			;This is a token
		MOVNE	R4,#0			;So clear backtrack count
		MOVS	R5,R14,LSL #16		;Shift it up a bit
		LDMNEFD	R13!,{PC}^		;More to match -- return

		; --- Come to the end of the line ---

15tok__keyWord	SUB	R7,R7,R4		;Do the backtracking
		CMP	R2,#0			;Did we find a token?
		MOVEQ	R6,#tState__ident	;No -- read an identifier
		LDMEQFD	R13!,{PC}^		;Bad luck then

		; --- We have found a match ---

11tok__keyWord	LDMFD	R13!,{R14}		;Restore return address
		MOV	R5,R2			;Get the matched token

		; --- Skip over REMS ---
		;
		; Comment tokens are not written to the output.

		CMP	R5,#tok_rem		;Check for REM statements
		CMPNE	R5,#tok_DD		;Or a // comment
		MOVEQ	R6,#tState__lineCmt	;Introduces line comments
		MOVEQS	PC,R14			;Return if it was one

		CMP	R5,#tok_DT		;Is it a /* comment?
		MOVEQ	R6,#tState__blkCmt	;Yes -- it's a block comment
		MOVEQS	PC,R14			;And return to caller

		; --- Set up various flags and things ---

17tok__keyWord	STRB	R5,[R10],#1		;Store in the block
		BIC	R3,R3,#tFlag__readFN+tFlag__readPROC
		CMP	R5,#tok_proc		;Is this a PROC?
		ORREQ	R3,R3,#tFlag__readPROC	;Yes -- remember this
		CMP	R5,#tok_fn		;Or a FN?
		ORREQ	R3,R3,#tFlag__readFN	;Yes -- remember this
		TST	R3,#tFlag__readPROC+tFlag__readFN
		MOVNE	R6,#tState__label	;If either -- change state
		MOVNES	PC,R14			;...and return

		BIC	R3,R3,#tFlag__readDEF	;No -- clear DEF flag
		CMP	R5,#tok_def		;Was it a DEF then?
		ORREQ	R3,R3,#tFlag__readDEF	;Yes -- set the def flag

		; --- Are we expecting a label next? ---

		CMP	R5,#tok_goto		;Is there a label next?
		CMPNE	R5,#tok_gosub
		CMPNE	R5,#tok_restore
		MOVEQ	R6,#tState__expLab	;Yes -- change state
		MOVEQ	R4,#0			;...No characters read yet
		BIC	R3,R3,#tFlag__readDot	;We are not expecting a dot
		MOVEQS	PC,R14			;...and return

		; --- Return to caller ---

		MOV	R6,#tState__dunno	;Change state back again
		MOVS	PC,R14			;And return to caller

		; --- User has abbreviated key word ---
		;
		; R1 points at the list the next character would have been
		; matched in.  R4 is left at zero by the loop.

18tok__keyWord	ADR	R0,tokTable		;Point to the table
19tok__keyWord	LDR	R5,[R1,#0]		;Load first entry of list
		MOVS	R4,R5,LSL #16		;Shift it up a bit
		ADDNE	R1,R0,R4,LSR #16	;If more to go -- point
		BNE	%19tok__keyWord		;...and keep on looping
		BIC	R2,R5,#&FF000000	;Clear the match char
		MOV	R2,R2,LSR #16		;And get the final token
		B	%11tok__keyWord		;Deal with the key word

		GET	sh.tokTable

		LTORG

; --- States for the tokeniser ---
;
; These index the branch table in tokenise, so the two must be kept in
; the same order.

		^	0
tState__stmt	#	1			;Start of a new statement
tState__dunno	#	1			;Not sure what to expect
tState__string	#	1			;Tokenising a string
tState__dblQte	#	1			;Checking for double quotes
tState__star	#	1			;Processing a *command
tState__decimal	#	1			;Reading a decimal/bin number
tState__hex	#	1			;Reading a hex number
tState__ident	#	1			;Processing an identifier
tState__keyWord	#	1			;Checking for keywords
tState__label	#	1			;Reading a label
tState__expLab	#	1			;Waiting for a label
tState__lineCmt	#	1			;Skipping a line comment
tState__blkCmt	#	1			;Skipping a block comment
tState__cmtStar	#	1			;Found star in block comment

; --- Flags ---

tFlag__readDot	EQU	(1<<0)			;Creating a label
tFlag__readDEF	EQU	(1<<1)			;We're doing a def
tFlag__readFN	EQU	(1<<5)			;Just read a FN
tFlag__readPROC	EQU	(1<<6)			;Just read a PROC

;----- Workspace ------------------------------------------------------------

;----- That's all, folks ----------------------------------------------------

		END