✨ add lexer

2022-09-06 22:02:37 +02:00 · 2022-09-06 22:02:37 +02:00 · 68fc67389c
commit 68fc67389c
parent 949df05292
2 changed files with 624 additions and 5 deletions
--- a/src/lexer.js
+++ b/src/lexer.js
@ -15,6 +15,622 @@ class Juicescript_lexer {
 		MAIN: Run lexical analysis
 	*/
 	scan(){
-		/**/this.io.stderr.debug(this.source);
+		// RESET //
 		// counters
 		this.start = 0;
 		this.end = 0;
 		this.line = 1;
 		// token list
 		this.token_list = [];
 		// SCAN WHOLE SOURCE //
 		while(!this.is_at_end()){
 			// start where last scan ended
 			this.start = this.end;
 			// scan next token
 			this.scan_one();
 		}
 		// ADD END-OF-FILE TOKEN //
 		this.token_add({
 			type: Juicescript.token_type.EOF,
 			line: this.line
 		});
 		// RETURN LIST OF TOKENS //
 		return this.token_list;
 	}
-}
+	
 	/*
 		HELPER: Scan one token at current position
 	*/
 	scan_one(){
 		// consume next character
 		var char = this.next();
 		// scan character
 		switch(char){
 			// WHITESPACE //
 			case " ":
 			case "\r":
 			case "\t":
 				break;
 			// NEWLINE //
 			case "\n":
 				this.line++;
 				break;
 			// OPERATORS //
 			case "!":
 				if		(!this.match("="))		this.token_add({type: Juicescript.token_type.NOT});
 				else if	(!this.match("="))		this.token_add({type: Juicescript.token_type.NOT_EQUAL});
 				else							this.token_add({type: Juicescript.token_type.STRICT_NOT_EQUAL});
 				break;
 			case "=":
 				if		(!this.match("="))		this.token_add({type: Juicescript.token_type.EQUAL});
 				else if	(!this.match("="))		this.token_add({type: Juicescript.token_type.EQUAL});
 				else							this.token_add({type: Juicescript.token_type.STRICT_EQUAL});
 				break;
 			case "<":
 				if		(this.match("="))		this.token_add({type: Juicescript.token_type.LESS_EQUAL});
 				else							this.token_add({type: Juicescript.token_type.LESS});
 				break;
 			case ">":
 				if		(this.match("="))		this.token_add({type: Juicescript.token_type.GREATER_EQUAL});
 				else							this.token_add({type: Juicescript.token_type.GREATER});
 				break;
 			// BRACKETS //
 			case "[":
 				this.token_add({type: Juicescript.token_type.BRACKET_SQUARE_OPEN});
 				break;
 			case "]":
 				this.token_add({type: Juicescript.token_type.BRACKET_SQUARE_CLOSE});
 				break;
 			case "{":
 				this.token_add({type: Juicescript.token_type.BRACKET_CURLY_OPEN});
 				break;
 			case "}":
 				this.token_add({type: Juicescript.token_type.BRACKET_CURLY_CLOSE});
 				break;
 			// COMMENTS //
 			case "#":
 			case ";":
 			case "/":
 				// block comment
 				if(char === "/" && this.match("*")){
 					this.scan_block_comment();
 					break;
 				}
 				// single slash
 				if(char === "/" && !this.match("/")){
 					this.warning("unexpected character '" + char + "'");
 					break;
 				}
 				// normal comment
 				while(this.peek() !== "\n" && !this.is_at_end()) this.next();
 				break;
 			// STRINGS //
 			// handle escape sequences
 			case "\"":
 				this.scan_string(char, true);
 				break;
 			// ignore escape sequences
 			case "'":
 				this.scan_string(char, false);
 				break;
 			// VARIABLE //
 			case "$":
 				this.scan_variable();
 				break;
 			// PREFIXED FLAGS //
 			case ":":
 				this.scan_flag();
 				break;
 			// EVERYTHING ELSE //
 			default:
 				// number
 				if(this.is_digit(char)){
 					this.scan_number();
 					break;
 				}
 				// identifier
 				if(this.is_alpha(char)){
 					this.scan_identifier();
 					break;
 				}
 				// unexpected
 				this.warning("unexpected character '" + char + "'");
 				break;
 		}
 	}
 	/*
 		SCANNER: Handle string surrounded by MARKER and optionally convert ESCAPE-SEQUENCES
 	*/
 	scan_string(marker, escape_sequences){
 		// TRY TO CONSUME UNTIL END OF SOURCE //
 		while(!this.is_at_end()){
 			// do we have a quote?
 			if(this.peek() === marker){
 				// count backslashes in front of quote
 				var backslash_count = 0;
 				while(this.peek(-(backslash_count + 1)) === "\\"){
 					backslash_count++;
 				};
 				// terminate string if count of backslashes is correct
 				if(backslash_count % 2 === 0) break;
 			}
 			// take note of passed lines
 			if(this.peek() === "\n") this.line++;
 			// consume next character
 			this.next();
 		}
 		// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
 		if(this.is_at_end()){
 			// ignore with warning
 			this.warning("unterminated string");
 			return;
 		}
 		// GET STRING VALUE //
 		// consume closing quote
 		this.next();
 		// get consumed string
 		var string = this.source.substring(this.start + 1, this.end - 1);
 		// RESOLVE ESCAPE SEQUENCES //
 		// iterate over whole string
 		var offset = 0;
 		var pos = -1;
 		while((pos = string.indexOf("\\", offset)) > -1){
 			// defaults for escaping one character
 			var char_escaped = string.substring(pos + 1, pos + 2);
 			var replace = char_escaped;
 			var remove_length = replace.length;
 			// special escape sequences
 			switch(char_escaped){
 				// newline
 				case "n":
 					replace = "\n";
 					break;
 				// tab
 				case "t":
 					replace = "\t";
 					break;
 				// null
 				case "0":
 					replace = "\0";
 					break;
 				// unicode
 				case "u":
 					// get four-letter codepoint string
 					var next_four_chars = string.substring(pos + 2, pos + 6);
 					// check if this is valid hexadecimal
 					if(/^[0-9a-fA-F]*$/.test(next_four_chars)){
 						// convert codepoint to decimal number
 						var codepoint = parseInt(next_four_chars, 16);
 						// get corresponding unicode character
 						replace = String.fromCharCode(codepoint);
 						remove_length += 4;
 					}
 					break;
 			}
 			// if all aren't allowed, only replace essential escape sequences
 			if(replace === "\\" || replace === "'" || escape_sequences){
 				// replace in string
 				string = string.substr(0, pos) + replace + string.substr(pos + 1 + remove_length);
 			}
 			// remember we resolved this one
 			offset = pos + replace.length;
 		}
 		// ADD TOKEN //
 		this.token_add({type: Juicescript.token_type.STRING, value: string});
 	}
 	/*
 		SCANNER: Handle block comment
 	*/
 	scan_block_comment(){
 		// TRY TO CONSUME UNTIL END OF SOURCE //
 		while(!this.is_at_end()){
 			// do we have a `*/`?
 			if(this.peek(-1) === "*" && this.peek() === "/"){
 				// block comment ends here
 				break;
 			}
 			// take note of passed lines
 			if(this.peek() === "\n") this.line++;
 			// consume next character
 			this.next();
 		}
 		// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
 		if(this.is_at_end()){
 			// ignore with warning
 			this.warning("unterminated block comment");
 			return;
 		}
 		// consume (=ignore) closing slash
 		this.next();
 	}
 	/*
 		SCANNER: Handle variable
 	*/
 	scan_variable(){
 		// GET VARIABLE NAME //
 		// consume all valid characters
 		while(this.is_alphanumeric(this.peek())) this.next();
 		// get consumed string
 		var variable = this.source.substring(this.start + 1, this.end);
 		// CHECK IF THERE EVEN IS A NAME //
 		if(variable.length <= 0){
 			// ignore with warning
 			this.warning("unexpected character '" + this.source.charAt(this.start) + "'");
 			return;
 		}
 		// ADD TOKEN //
 		this.token_add({type: Juicescript.token_type.VARIABLE, value: variable});
 	}
 	/*
 		SCANNER: Handle number
 	*/
 	scan_number(){
 		// DEFAULT VALUES FOR BASE 10 //
 		var base = null;
 		var is_valid_char = this.is_digit;
 		var number_string_offset = 0;
 		// HANDLE OTHER BASES //
 		// check for '0' prefix
 		if(this.peek(-1) === "0"){
 			// assume we have to cut off a prefix of length 2
 			number_string_offset = 2;
 			// check
 			switch(this.peek().toLowerCase()){
 				case "b":
 					// binary (base 2)
 					base = 2;
 					is_valid_char = this.is_binary;
 					break;
 				case "o":
 					// octal (base 8)
 					base = 8;
 					is_valid_char = this.is_octal;
 					break;
 				case "x":
 					// hexadecimal (base 16)
 					base = 16;
 					is_valid_char = this.is_hexadecimal;
 					break;
 				default:
 					// didn't find valid base-char, ignore prefix
 					number_string_offset = 0;
 			}
 			// consume base-char if valid
 			if(number_string_offset > 0) this.next();
 		}
 		// GET NUMBER'S VALUE AS STRING //
 		// consume all valid chars
 		while(is_valid_char(this.peek())) this.next();
 		// allow decimal point on base 10 numbers
 		if(base === null && this.peek() === "." && is_valid_char(this.peek(1))){
 			// consume decimal point
 			this.next();
 			// consume all valid chars
 			while(is_valid_char(this.peek())) this.next();
 		}
 		// get consumed string
 		var number_string = this.source.substring(this.start + number_string_offset, this.end);
 		// STORE NUMBER IN TOKEN //
 		// parse number 
 		if(base !== null){
 			// custom base
 			var number = parseFloat(parseInt(number_string, base));
 		} else {
 			// base 10
 			var number = parseFloat(number_string);
 		}
 		// add token
 		this.token_add({type: Juicescript.token_type.NUMBER, value: number});
 	}
 	/*
 		SCANNER: Handle identifier
 	*/
 	scan_identifier(){
 		// GET IDENTIFIER NAME //
 		// consume all valid chars
 		while(this.is_alphanumeric(this.peek())) this.next();
 		// get consumed string
 		var identifier = this.source.substring(this.start, this.end);
 		// CHEKC IF THIS IS A SUFFIXED FLAG //
 		// has `:` after it?
 		if(this.match(":")){
 			// add token
 			this.token_add({type: Juicescript.token_type.FLAG, value: identifier});
 			// ignore the rest
 			return;
 		}
 		// MAYBE CONVERT IDENTIFIER TO KEYWORD //
 		// try to load from lookup table
 		var keyword = ({
 			"DEF": Juicescript.token_type.DEF,
 			"GLOB": Juicescript.token_type.GLOBAL,
 			"GLOBAL": Juicescript.token_type.GLOBAL,
 			"PUB": Juicescript.token_type.GLOBAL,
 			"PUBLIC": Juicescript.token_type.GLOBAL,
 			"END": Juicescript.token_type.END,
 			"TRUE": Juicescript.token_type.TRUE,
 			"FALSE": Juicescript.token_type.FALSE,
 			"NULL": Juicescript.token_type.NULL,
 		})[identifier.toUpperCase()] ?? null;
 		// found something?
 		if(keyword !== null){
 			// found entry: add keyword token
 			this.token_add({type: keyword});
 		} else {
 			// didn't find entry: add as identifier
 			this.token_add({type: Juicescript.token_type.IDENTIFIER, value: identifier});
 		}
 	}
 	/*
 		SCANNER: Handle flag
 	*/
 	scan_flag(){
 		// GET FLAG NAME //
 		// consume all valid characters
 		while(this.is_alphanumeric(this.peek())) this.next();
 		// get consumed string
 		var flag = this.source.substring(this.start + 1, this.end);
 		// consume (=ignore) optional `:` suffix
 		this.match(":");
 		// CHECK IF THERE EVEN IS A NAME //
 		if(flag.length <= 0){
 			// ignore with warning
 			this.warning("unexpected character '" + this.source.charAt(this.start) + "'");
 			return;
 		}
 		// ADD TOKEN //
 		this.token_add({type: Juicescript.token_type.FLAG, value: flag});
 	}
 	/*
 		HELPER: Consume next character from source
 	*/
 	next(){
 		return this.source.charAt(this.end++);
 	}
 	/*
 		HELPER: Return OFFSET next character from source
 	*/
 	peek(offset = 0){
 		return this.source.charAt(this.end + offset);
 	}
 	/*
 		HELPER: Consume (and return true) if OFFSET next character from source matches NEEDLE
 	*/
 	match(needle, offset = 0){
 		// ignore if it doesn't match
 		if(this.peek(offset) != needle) return false;
 		// consume if it matches
 		this.end += offset + 1;
 		return true;
 	}
 	/*
 		HELPER: Return if we are at end of source
 	*/
 	is_at_end(){
 		return this.end >= this.source.length;
 	}
 	/*
 		HELPER: Is CHAR a digit?
 	*/
 	is_digit(char){
 		return (char >= "0" && char <= "9");
 	}
 	/*
 		HELPER: Is CHAR a binary digit?
 	*/
 	is_binary(char){
 		return (char === "0" || char === "1");
 	}
 	/*
 		HELPER: Is CHAR a octal digit?
 	*/
 	is_octal(char){
 		return (char >= "0" && char <= "7");
 	}
 	/*
 		HELPER: Is CHAR a hexadecimal digit?
 	*/
 	is_hexadecimal(char){
 		return	(char >= "0" && char <= "9") ||
 				(char >= "a" && char <= "f") ||
 				(char >= "A" && char <= "F");
 	}
 	/*
 		HELPER: Is CHAR a char from a-z?
 	*/
 	is_alpha(char){
 		return	(char >= "a" && char <= "z") ||
 				(char >= "A" && char <= "Z");
 	}
 	/*
 		HELPER: Is CHAR a in a-z, A-Z, -, _?
 	*/
 	is_alphanumeric(char){
 		return (this.is_alpha(char) || this.is_digit(char) || char === "-" || char === "_");
 	}
 	/*
 		HELPER: Add new token object with OPTIONS
 	*/
 	token_add(options){
 		// NEW OJECT //
 		var token = {};
 		// COLLECT REQUIRED ATTRIBUTES //
 		// type
 		token.type = options.type ?? null;
 		if(!Juicescript.token_type.has(token.type)){
 			throw "invalid token type '" + token.type + "'";
 		}
 		// COLLECT ATTRIBUTES WITH POSSIBLE FALLBACK VALUES //
 		// line
 		token.line = options.line ?? this.line;
 		// lexeme
 		token.lexeme = options.lexeme ?? this.source.substring(this.start, this.end);
 		// OPTIONAL ATTRIBUTES //
 		token.value = options.value ?? null;
 		// ADD TO LIST //
 		this.token_list.push(token);
 	}
 	/*
 		HELPER: Automagically add additional info to stderr
 	*/
 	debug(text, additional){
 		// add defaults
 		additional ??= {};
 		additional.line ??= this.line;
 		// forward
 		this.io.stderr.debug(text, additional);
 	}
 	info(text, additional){
 		// add defaults
 		additional ??= {};
 		additional.line ??= this.line;
 		// forward
 		this.io.stderr.info(text, additional);
 	}
 	warning(text, additional){
 		// add defaults
 		additional ??= {};
 		additional.line ??= this.line;
 		// forward
 		this.io.stderr.warning(text, additional);
 	}
 	debug(text, additional){
 		// add defaults
 		additional ??= {};
 		additional.line ??= this.line;
 		// forward
 		this.io.stderr.debug(text, additional);
 	}
 }
--- a/src/main.js
+++ b/src/main.js
@ -2,7 +2,7 @@ class Juicescript {
 	// TOKEN TYPES //
 	static token_type = new Juicescript_helper_enum(
 		// keywords
-		"FUN", "GLOBAL", "END",
+		"DEF", "GLOBAL", "END",
 		// literals
 		"IDENTIFIER", "VARIABLE", "FLAG", "STRING", "NUMBER",
@ -18,7 +18,7 @@ class Juicescript {
 		"GREATER", "GREATER_EQUAL",
 		"LESS", "LESS_EQUAL",
-		// braces
+		// brackets
 		"BRACKET_SQUARE_OPEN", "BRACKET_SQUARE_CLOSE",
 		"BRACKET_CURLY_OPEN", "BRACKET_CURLY_CLOSE",
@ -63,6 +63,9 @@ class Juicescript {
 		// run lexical analysis
 		var token_list = lexer.scan();
-		/**/console.log(token_list);
+		/**/for(var one_token of token_list){
 			/**/one_token.type = Juicescript.token_type.name(one_token.type);
 			/**/console.log(one_token);
 		/**/}
 	}
 }