0
0

add lexer

This commit is contained in:
DrMaxNix 2022-09-06 22:02:37 +02:00
parent 949df05292
commit 68fc67389c
2 changed files with 624 additions and 5 deletions

View File

@ -15,6 +15,622 @@ class Juicescript_lexer {
MAIN: Run lexical analysis MAIN: Run lexical analysis
*/ */
scan(){ scan(){
/**/this.io.stderr.debug(this.source); // RESET //
// counters
this.start = 0;
this.end = 0;
this.line = 1;
// token list
this.token_list = [];
// SCAN WHOLE SOURCE //
while(!this.is_at_end()){
// start where last scan ended
this.start = this.end;
// scan next token
this.scan_one();
}
// ADD END-OF-FILE TOKEN //
this.token_add({
type: Juicescript.token_type.EOF,
line: this.line
});
// RETURN LIST OF TOKENS //
return this.token_list;
} }
}
/*
HELPER: Scan one token at current position
*/
scan_one(){
// consume next character
var char = this.next();
// scan character
switch(char){
// WHITESPACE //
case " ":
case "\r":
case "\t":
break;
// NEWLINE //
case "\n":
this.line++;
break;
// OPERATORS //
case "!":
if (!this.match("=")) this.token_add({type: Juicescript.token_type.NOT});
else if (!this.match("=")) this.token_add({type: Juicescript.token_type.NOT_EQUAL});
else this.token_add({type: Juicescript.token_type.STRICT_NOT_EQUAL});
break;
case "=":
if (!this.match("=")) this.token_add({type: Juicescript.token_type.EQUAL});
else if (!this.match("=")) this.token_add({type: Juicescript.token_type.EQUAL});
else this.token_add({type: Juicescript.token_type.STRICT_EQUAL});
break;
case "<":
if (this.match("=")) this.token_add({type: Juicescript.token_type.LESS_EQUAL});
else this.token_add({type: Juicescript.token_type.LESS});
break;
case ">":
if (this.match("=")) this.token_add({type: Juicescript.token_type.GREATER_EQUAL});
else this.token_add({type: Juicescript.token_type.GREATER});
break;
// BRACKETS //
case "[":
this.token_add({type: Juicescript.token_type.BRACKET_SQUARE_OPEN});
break;
case "]":
this.token_add({type: Juicescript.token_type.BRACKET_SQUARE_CLOSE});
break;
case "{":
this.token_add({type: Juicescript.token_type.BRACKET_CURLY_OPEN});
break;
case "}":
this.token_add({type: Juicescript.token_type.BRACKET_CURLY_CLOSE});
break;
// COMMENTS //
case "#":
case ";":
case "/":
// block comment
if(char === "/" && this.match("*")){
this.scan_block_comment();
break;
}
// single slash
if(char === "/" && !this.match("/")){
this.warning("unexpected character '" + char + "'");
break;
}
// normal comment
while(this.peek() !== "\n" && !this.is_at_end()) this.next();
break;
// STRINGS //
// handle escape sequences
case "\"":
this.scan_string(char, true);
break;
// ignore escape sequences
case "'":
this.scan_string(char, false);
break;
// VARIABLE //
case "$":
this.scan_variable();
break;
// PREFIXED FLAGS //
case ":":
this.scan_flag();
break;
// EVERYTHING ELSE //
default:
// number
if(this.is_digit(char)){
this.scan_number();
break;
}
// identifier
if(this.is_alpha(char)){
this.scan_identifier();
break;
}
// unexpected
this.warning("unexpected character '" + char + "'");
break;
}
}
/*
SCANNER: Handle string surrounded by MARKER and optionally convert ESCAPE-SEQUENCES
*/
scan_string(marker, escape_sequences){
// TRY TO CONSUME UNTIL END OF SOURCE //
while(!this.is_at_end()){
// do we have a quote?
if(this.peek() === marker){
// count backslashes in front of quote
var backslash_count = 0;
while(this.peek(-(backslash_count + 1)) === "\\"){
backslash_count++;
};
// terminate string if count of backslashes is correct
if(backslash_count % 2 === 0) break;
}
// take note of passed lines
if(this.peek() === "\n") this.line++;
// consume next character
this.next();
}
// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
if(this.is_at_end()){
// ignore with warning
this.warning("unterminated string");
return;
}
// GET STRING VALUE //
// consume closing quote
this.next();
// get consumed string
var string = this.source.substring(this.start + 1, this.end - 1);
// RESOLVE ESCAPE SEQUENCES //
// iterate over whole string
var offset = 0;
var pos = -1;
while((pos = string.indexOf("\\", offset)) > -1){
// defaults for escaping one character
var char_escaped = string.substring(pos + 1, pos + 2);
var replace = char_escaped;
var remove_length = replace.length;
// special escape sequences
switch(char_escaped){
// newline
case "n":
replace = "\n";
break;
// tab
case "t":
replace = "\t";
break;
// null
case "0":
replace = "\0";
break;
// unicode
case "u":
// get four-letter codepoint string
var next_four_chars = string.substring(pos + 2, pos + 6);
// check if this is valid hexadecimal
if(/^[0-9a-fA-F]*$/.test(next_four_chars)){
// convert codepoint to decimal number
var codepoint = parseInt(next_four_chars, 16);
// get corresponding unicode character
replace = String.fromCharCode(codepoint);
remove_length += 4;
}
break;
}
// if all aren't allowed, only replace essential escape sequences
if(replace === "\\" || replace === "'" || escape_sequences){
// replace in string
string = string.substr(0, pos) + replace + string.substr(pos + 1 + remove_length);
}
// remember we resolved this one
offset = pos + replace.length;
}
// ADD TOKEN //
this.token_add({type: Juicescript.token_type.STRING, value: string});
}
/*
SCANNER: Handle block comment
*/
scan_block_comment(){
// TRY TO CONSUME UNTIL END OF SOURCE //
while(!this.is_at_end()){
// do we have a `*/`?
if(this.peek(-1) === "*" && this.peek() === "/"){
// block comment ends here
break;
}
// take note of passed lines
if(this.peek() === "\n") this.line++;
// consume next character
this.next();
}
// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
if(this.is_at_end()){
// ignore with warning
this.warning("unterminated block comment");
return;
}
// consume (=ignore) closing slash
this.next();
}
/*
SCANNER: Handle variable
*/
scan_variable(){
// GET VARIABLE NAME //
// consume all valid characters
while(this.is_alphanumeric(this.peek())) this.next();
// get consumed string
var variable = this.source.substring(this.start + 1, this.end);
// CHECK IF THERE EVEN IS A NAME //
if(variable.length <= 0){
// ignore with warning
this.warning("unexpected character '" + this.source.charAt(this.start) + "'");
return;
}
// ADD TOKEN //
this.token_add({type: Juicescript.token_type.VARIABLE, value: variable});
}
/*
SCANNER: Handle number
*/
scan_number(){
// DEFAULT VALUES FOR BASE 10 //
var base = null;
var is_valid_char = this.is_digit;
var number_string_offset = 0;
// HANDLE OTHER BASES //
// check for '0' prefix
if(this.peek(-1) === "0"){
// assume we have to cut off a prefix of length 2
number_string_offset = 2;
// check
switch(this.peek().toLowerCase()){
case "b":
// binary (base 2)
base = 2;
is_valid_char = this.is_binary;
break;
case "o":
// octal (base 8)
base = 8;
is_valid_char = this.is_octal;
break;
case "x":
// hexadecimal (base 16)
base = 16;
is_valid_char = this.is_hexadecimal;
break;
default:
// didn't find valid base-char, ignore prefix
number_string_offset = 0;
}
// consume base-char if valid
if(number_string_offset > 0) this.next();
}
// GET NUMBER'S VALUE AS STRING //
// consume all valid chars
while(is_valid_char(this.peek())) this.next();
// allow decimal point on base 10 numbers
if(base === null && this.peek() === "." && is_valid_char(this.peek(1))){
// consume decimal point
this.next();
// consume all valid chars
while(is_valid_char(this.peek())) this.next();
}
// get consumed string
var number_string = this.source.substring(this.start + number_string_offset, this.end);
// STORE NUMBER IN TOKEN //
// parse number
if(base !== null){
// custom base
var number = parseFloat(parseInt(number_string, base));
} else {
// base 10
var number = parseFloat(number_string);
}
// add token
this.token_add({type: Juicescript.token_type.NUMBER, value: number});
}
/*
SCANNER: Handle identifier
*/
scan_identifier(){
// GET IDENTIFIER NAME //
// consume all valid chars
while(this.is_alphanumeric(this.peek())) this.next();
// get consumed string
var identifier = this.source.substring(this.start, this.end);
// CHEKC IF THIS IS A SUFFIXED FLAG //
// has `:` after it?
if(this.match(":")){
// add token
this.token_add({type: Juicescript.token_type.FLAG, value: identifier});
// ignore the rest
return;
}
// MAYBE CONVERT IDENTIFIER TO KEYWORD //
// try to load from lookup table
var keyword = ({
"DEF": Juicescript.token_type.DEF,
"GLOB": Juicescript.token_type.GLOBAL,
"GLOBAL": Juicescript.token_type.GLOBAL,
"PUB": Juicescript.token_type.GLOBAL,
"PUBLIC": Juicescript.token_type.GLOBAL,
"END": Juicescript.token_type.END,
"TRUE": Juicescript.token_type.TRUE,
"FALSE": Juicescript.token_type.FALSE,
"NULL": Juicescript.token_type.NULL,
})[identifier.toUpperCase()] ?? null;
// found something?
if(keyword !== null){
// found entry: add keyword token
this.token_add({type: keyword});
} else {
// didn't find entry: add as identifier
this.token_add({type: Juicescript.token_type.IDENTIFIER, value: identifier});
}
}
/*
SCANNER: Handle flag
*/
scan_flag(){
// GET FLAG NAME //
// consume all valid characters
while(this.is_alphanumeric(this.peek())) this.next();
// get consumed string
var flag = this.source.substring(this.start + 1, this.end);
// consume (=ignore) optional `:` suffix
this.match(":");
// CHECK IF THERE EVEN IS A NAME //
if(flag.length <= 0){
// ignore with warning
this.warning("unexpected character '" + this.source.charAt(this.start) + "'");
return;
}
// ADD TOKEN //
this.token_add({type: Juicescript.token_type.FLAG, value: flag});
}
/*
HELPER: Consume next character from source
*/
next(){
return this.source.charAt(this.end++);
}
/*
HELPER: Return OFFSET next character from source
*/
peek(offset = 0){
return this.source.charAt(this.end + offset);
}
/*
HELPER: Consume (and return true) if OFFSET next character from source matches NEEDLE
*/
match(needle, offset = 0){
// ignore if it doesn't match
if(this.peek(offset) != needle) return false;
// consume if it matches
this.end += offset + 1;
return true;
}
/*
HELPER: Return if we are at end of source
*/
is_at_end(){
return this.end >= this.source.length;
}
/*
HELPER: Is CHAR a digit?
*/
is_digit(char){
return (char >= "0" && char <= "9");
}
/*
HELPER: Is CHAR a binary digit?
*/
is_binary(char){
return (char === "0" || char === "1");
}
/*
HELPER: Is CHAR an octal digit?
*/
is_octal(char){
return (char >= "0" && char <= "7");
}
/*
HELPER: Is CHAR a hexadecimal digit?
*/
is_hexadecimal(char){
return (char >= "0" && char <= "9") ||
(char >= "a" && char <= "f") ||
(char >= "A" && char <= "F");
}
/*
HELPER: Is CHAR a letter (a-z or A-Z)?
*/
is_alpha(char){
return (char >= "a" && char <= "z") ||
(char >= "A" && char <= "Z");
}
/*
HELPER: Is CHAR in a-z, A-Z, 0-9, -, _?
*/
is_alphanumeric(char){
return (this.is_alpha(char) || this.is_digit(char) || char === "-" || char === "_");
}
/*
HELPER: Add new token object with OPTIONS
*/
token_add(options){
// NEW OJECT //
var token = {};
// COLLECT REQUIRED ATTRIBUTES //
// type
token.type = options.type ?? null;
if(!Juicescript.token_type.has(token.type)){
throw "invalid token type '" + token.type + "'";
}
// COLLECT ATTRIBUTES WITH POSSIBLE FALLBACK VALUES //
// line
token.line = options.line ?? this.line;
// lexeme
token.lexeme = options.lexeme ?? this.source.substring(this.start, this.end);
// OPTIONAL ATTRIBUTES //
token.value = options.value ?? null;
// ADD TO LIST //
this.token_list.push(token);
}
/*
HELPER: Automagically add additional info to stderr
*/
debug(text, additional){
// add defaults
additional ??= {};
additional.line ??= this.line;
// forward
this.io.stderr.debug(text, additional);
}
info(text, additional){
// add defaults
additional ??= {};
additional.line ??= this.line;
// forward
this.io.stderr.info(text, additional);
}
warning(text, additional){
// add defaults
additional ??= {};
additional.line ??= this.line;
// forward
this.io.stderr.warning(text, additional);
}
debug(text, additional){
// add defaults
additional ??= {};
additional.line ??= this.line;
// forward
this.io.stderr.debug(text, additional);
}
}

View File

@ -2,7 +2,7 @@ class Juicescript {
// TOKEN TYPES // // TOKEN TYPES //
static token_type = new Juicescript_helper_enum( static token_type = new Juicescript_helper_enum(
// keywords // keywords
"FUN", "GLOBAL", "END", "DEF", "GLOBAL", "END",
// literals // literals
"IDENTIFIER", "VARIABLE", "FLAG", "STRING", "NUMBER", "IDENTIFIER", "VARIABLE", "FLAG", "STRING", "NUMBER",
@ -18,7 +18,7 @@ class Juicescript {
"GREATER", "GREATER_EQUAL", "GREATER", "GREATER_EQUAL",
"LESS", "LESS_EQUAL", "LESS", "LESS_EQUAL",
// braces // brackets
"BRACKET_SQUARE_OPEN", "BRACKET_SQUARE_CLOSE", "BRACKET_SQUARE_OPEN", "BRACKET_SQUARE_CLOSE",
"BRACKET_CURLY_OPEN", "BRACKET_CURLY_CLOSE", "BRACKET_CURLY_OPEN", "BRACKET_CURLY_CLOSE",
@ -63,6 +63,9 @@ class Juicescript {
// run lexical analysis // run lexical analysis
var token_list = lexer.scan(); var token_list = lexer.scan();
/**/console.log(token_list); /**/for(var one_token of token_list){
/**/one_token.type = Juicescript.token_type.name(one_token.type);
/**/console.log(one_token);
/**/}
} }
} }