// legacy-code/src/lexer.js

/*
	Lexer for juicescript source code.

	Turns a source string into a flat list of token objects of the shape
	`{type, line, lexeme, value}`. Problems are reported through the `stderr`
	object of the io adapter and counted in `warning_count` / `error_count`;
	offending input is skipped so scanning always continues until end of source.
*/
class Juicescript_lexer {
	/*
		CONSTRUCTOR: Return new juicescript lexer for SOURCE with OPTIONS

		SOURCE is the program text to scan. OPTIONS must carry an `io` adapter,
		its `stderr` object is used by debug(), info(), warning() and error().
	*/
	constructor(source, options){
		// STORE ARGUMENTS //
		// source
		this.source = source;

		// io adapter
		this.io = options.io;
	}

	/*
		MAIN: Run lexical analysis

		Resets all scanner state, scans the whole source and returns the list
		of tokens. The list always ends with an EOF token.
	*/
	scan(){
		// RESET //
		// counters (`start` = first char of current lexeme, `end` = next unread char)
		this.start = 0;
		this.end = 0;
		this.line = 1;

		// token list
		this.token_list = [];

		// warning and error counter
		this.warning_count = 0;
		this.error_count = 0;


		// SCAN WHOLE SOURCE //
		while(!this.is_at_end()){
			// start where last scan ended
			this.start = this.end;

			// consume next character
			this.next();

			// scan next token
			this.scan_one();
		}


		// ADD END-OF-FILE TOKEN //
		this.token_add({
			type: Juicescript.token_type.EOF,
			line: this.line,
			lexeme: ""
		});


		// RETURN LIST OF TOKENS //
		return this.token_list;
	}

	/*
		HELPER: Scan one token at current position

		Expects the first character of the lexeme to be consumed already
		(available as `this.char`), dispatches on it and consumes the rest.
	*/
	scan_one(){
		const type = Juicescript.token_type;

		switch(this.char){
			// WHITESPACE (ignored) //
			case " ":
			case "\r":
			case "\t":
				break;


			// DELIMITER //
			case ";":
				this.token_add({type: type.DELIMITER});
				break;


			// NEWLINE (acts as delimiter) //
			case "\n":
				// add token before counting, so it belongs to the line it terminates
				this.token_add({type: type.DELIMITER});
				this.line++;
				break;


			// OPERATORS //
			case "!":
				this.token_add({type: this.match("=") ? type.NOT_EQUAL : type.NOT});
				break;

			case "=":
				// a single `=` is not a valid operator
				if(this.match("="))	this.token_add({type: type.EQUAL_EQUAL});
				else				this.error_unexpected_char(this.char);
				break;

			case "<":
				this.token_add({type: this.match("=") ? type.LESS_EQUAL : type.LESS});
				break;

			case ">":
				this.token_add({type: this.match("=") ? type.GREATER_EQUAL : type.GREATER});
				break;


			// BRACKETS //
			case "[":
				this.token_add({type: type.BRACKET_SQUARE_OPEN});
				break;

			case "]":
				this.token_add({type: type.BRACKET_SQUARE_CLOSE});
				break;

			case "{":
				this.token_add({type: type.BRACKET_CURLY_OPEN});
				break;

			case "}":
				this.token_add({type: type.BRACKET_CURLY_CLOSE});
				break;


			// COMMENTS (`# ...`, `// ...` and `/* ... */`) //
			case "#":
			case "/":
				if(this.char === "/"){
					// block comment
					if(this.match("*")){
						this.scan_block_comment();
						break;
					}

					// single slash
					if(!this.match("/")){
						this.error_unexpected_char(this.char);
						break;
					}
				}

				// normal comment: skip to end of line, the newline itself is scanned as usual
				while(this.peek() !== "\n" && !this.is_at_end()) this.next();
				break;


			// STRINGS //
			// handle escape sequences
			case "\"":
				this.scan_string(this.char, true);
				break;

			// ignore escape sequences
			case "'":
				this.scan_string(this.char, false);
				break;


			// VARIABLE //
			case "$":
				this.scan_variable();
				break;


			// PREFIXED FLAGS //
			case ":":
				this.scan_flag();
				break;


			// NEGATIVE NUMBERS //
			case "-":
				// only if there's a valid digit after it
				if(this.is_digit(this.peek())){
					// consume first digit (the minus sign is consumed already and
					// stays part of the lexeme, scan_number() detects it there)
					this.next();

					// scan like a normal number
					this.scan_number();
					break;
				}

				// ignore with error
				this.error_unexpected_char(this.char);
				break;


			// SPECIAL CHARS //
			// ampersand
			case "&":
				this.token_add({type: type.AMPERSAND});
				break;

			// question mark
			case "?":
				this.token_add({type: type.QUESTION_MARK});
				break;


			// EVERYTHING ELSE //
			default:
				// numbers
				if(this.is_digit(this.char)){
					this.scan_number();
					break;
				}

				// identifiers
				if(this.is_alpha(this.char)){
					this.scan_identifier();
					break;
				}

				// unexpected (ignore with error)
				this.error_unexpected_char(this.char);
				break;
		}
	}

	/*
		SCANNER: Handle string surrounded by MARKER and optionally convert ESCAPE-SEQUENCES

		Expects the opening MARKER to be consumed already. Adds a STRING token
		with the resolved text as value, or reports "unterminated string" and
		adds nothing. Lines passed inside the string are counted before the
		token is added, so the token carries the line the string ends on.
	*/
	scan_string(marker, escape_sequences){
		// TRY TO CONSUME UNTIL END OF SOURCE //
		while(!this.is_at_end()){
			// do we have a quote?
			if(this.peek() === marker){
				// count backslashes in front of quote
				let backslash_count = 0;
				while(this.peek(-(backslash_count + 1)) === "\\"){
					backslash_count++;
				}

				// an even count means the quote itself is not escaped: string ends here
				if(backslash_count % 2 === 0) break;
			}

			// take note of passed lines
			if(this.peek() === "\n") this.line++;

			// consume next character
			this.next();
		}


		// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
		if(this.is_at_end()){
			// ignore with error
			this.error("unterminated string");
			return;
		}


		// GET STRING VALUE //
		// consume closing quote
		this.next();

		// get consumed string without the surrounding quotes
		const raw = this.source.substring(this.start + 1, this.end - 1);


		// ADD TOKEN //
		this.token_add({
			type: Juicescript.token_type.STRING,
			value: this.resolve_escape_sequences(raw, escape_sequences)
		});
	}

	/*
		HELPER: Return STRING with backslash escapes resolved

		With ESCAPE-SEQUENCES enabled `\n`, `\t`, `\0` and `\uXXXX` (exactly
		four hex digits) are translated, every other escaped character stands
		for itself. With ESCAPE-SEQUENCES disabled only the essential escapes
		`\\` and `\'` are resolved, everything else is kept verbatim.
	*/
	resolve_escape_sequences(string, escape_sequences){
		let result = "";
		let offset = 0;
		let pos;

		while((pos = string.indexOf("\\", offset)) > -1){
			// copy everything in front of the backslash unchanged
			result += string.substring(offset, pos);

			// character behind the backslash (empty on a trailing backslash)
			const char_escaped = string.charAt(pos + 1);

			// number of source chars this escape covers: backslash + escaped char
			let consumed = 1 + char_escaped.length;

			// only essential escapes are resolved if the others aren't allowed
			if(!escape_sequences){
				const is_essential = (char_escaped === "\\" || char_escaped === "'");
				result += is_essential ? char_escaped : "\\" + char_escaped;
				offset = pos + consumed;
				continue;
			}

			// default: the escaped character stands for itself
			let replace = char_escaped;

			// special escape sequences
			switch(char_escaped){
				// newline
				case "n":
					replace = "\n";
					break;

				// tab
				case "t":
					replace = "\t";
					break;

				// null
				case "0":
					replace = "\0";
					break;

				// unicode
				case "u": {
					// get four-letter codepoint string
					const hex = string.substring(pos + 2, pos + 6);

					// must be exactly four hex digits, otherwise `\u` is a plain `u`
					if(/^[0-9a-fA-F]{4}$/.test(hex)){
						replace = String.fromCharCode(parseInt(hex, 16));
						consumed += 4;
					}
					break;
				}
			}

			result += replace;

			// continue behind this escape, a resolved character is never rescanned
			offset = pos + consumed;
		}

		// copy the rest behind the last escape
		return result + string.substring(offset);
	}

	/*
		SCANNER: Handle block comment

		Expects the opening `/*` to be consumed already (current char is the
		`*`). Skips everything up to and including the closing star-slash, or
		reports "unterminated block comment" if the source ends first.
	*/
	scan_block_comment(){
		// TRY TO CONSUME UNTIL END OF SOURCE //
		while(!this.is_at_end()){
			// do we have a closing star-slash? the star has to come after the
			// two opening chars, otherwise `/*/` would count as a whole comment
			if(this.char === "*" && this.peek() === "/" && this.end > this.start + 2){
				// block comment ends here
				break;
			}

			// take note of passed lines
			if(this.peek() === "\n") this.line++;

			// consume next character
			this.next();
		}


		// DID WE REACH THE END OF SOURCE WITHOUT TERMINATION? //
		if(this.is_at_end()){
			// ignore with error
			this.error("unterminated block comment");
			return;
		}

		// consume (=ignore) closing slash
		this.next();
	}

	/*
		SCANNER: Handle variable

		Expects the `$` to be consumed already. Adds a VARIABLE token whose
		value is the name without the `$`.
	*/
	scan_variable(){
		// GET VARIABLE NAME //
		// consume all valid characters
		while(this.is_alphanumeric(this.peek())) this.next();

		// get consumed string without the leading `$`
		const variable = this.source.substring(this.start + 1, this.end);


		// ADD TOKEN //
		this.token_add({type: Juicescript.token_type.VARIABLE, value: variable});
	}

	/*
		SCANNER: Handle number

		Expects the first digit to be consumed already (`this.char`); a minus
		sign may sit in front of it at `this.start`. Supports base 10 with an
		optional fraction and the prefixes `0b`, `0o` and `0x` (any case).
		Adds a NUMBER token, or reports an error and adds nothing if a base
		prefix is not followed by at least one digit.
	*/
	scan_number(){
		// DEFAULT VALUES FOR BASE 10 NUMBER //
		let base = null;
		let prefix_length = 0;
		let is_valid_char = (char) => this.is_digit(char);


		// HANDLE OTHER BASES //
		// check for '0' prefix
		if(this.char === "0"){
			switch(this.peek().toLowerCase()){
				case "b":
					// binary (base 2)
					base = 2;
					is_valid_char = (char) => this.is_binary(char);
					break;

				case "o":
					// octal (base 8)
					base = 8;
					is_valid_char = (char) => this.is_octal(char);
					break;

				case "x":
					// hexadecimal (base 16)
					base = 16;
					is_valid_char = (char) => this.is_hexadecimal(char);
					break;
			}

			// consume base-char if valid and remember to cut off the prefix
			if(base !== null){
				prefix_length = 2;
				this.next();
			}
		}


		// GET NUMBER'S VALUE AS STRING //
		// consume all valid chars
		while(is_valid_char(this.peek())) this.next();

		// allow decimal point on base 10 numbers (only with a digit behind it)
		if(base === null && this.peek() === "." && is_valid_char(this.peek(1))){
			// consume decimal point
			this.next();

			// consume all valid chars
			while(is_valid_char(this.peek())) this.next();
		}

		// get consumed string
		const number_string_full = this.source.substring(this.start, this.end);


		// HANDLE NEGATIVE NUMBERS //
		const negative = number_string_full.startsWith("-");


		// STORE NUMBER IN TOKEN //
		// get number string without minus sign and base prefix
		const number_string = number_string_full.substring(prefix_length + (negative ? 1 : 0));

		// a base prefix without digits (e.g. `0x`) would parse to NaN
		if(number_string.length === 0){
			// ignore with error
			this.error("invalid number '" + number_string_full + "'");
			return;
		}

		// parse number
		let number = (base !== null) ? parseInt(number_string, base) : parseFloat(number_string);

		// maybe negate
		if(negative) number *= -1;

		// add token
		this.token_add({type: Juicescript.token_type.NUMBER, value: number});
	}

	/*
		SCANNER: Handle identifier

		Expects the first letter to be consumed already. Adds a FLAG token if
		the name is directly followed by `:`, a keyword token if the name is a
		keyword (case-insensitive), and an IDENTIFIER token otherwise.
	*/
	scan_identifier(){
		// GET IDENTIFIER NAME //
		// consume all valid chars
		while(this.is_alphanumeric(this.peek())) this.next();

		// get consumed string
		const identifier = this.source.substring(this.start, this.end);


		// CHECK IF THIS IS A SUFFIXED FLAG //
		// has `:` after it?
		if(this.match(":")){
			// add token
			this.token_add({type: Juicescript.token_type.FLAG, value: identifier});

			// ignore the rest
			return;
		}


		// MAYBE CONVERT IDENTIFIER TO KEYWORD //
		// try to load from lookup table
		const keyword = ({
			"DEF": Juicescript.token_type.DEF,

			"GLOB": Juicescript.token_type.GLOBAL,
			"GLOBAL": Juicescript.token_type.GLOBAL,
			"PUB": Juicescript.token_type.GLOBAL,
			"PUBLIC": Juicescript.token_type.GLOBAL,

			"END": Juicescript.token_type.END,

			"TRUE": Juicescript.token_type.TRUE,
			"FALSE": Juicescript.token_type.FALSE,

			"NULL": Juicescript.token_type.NULL,
		})[identifier.toUpperCase()] ?? null;

		// found something?
		if(keyword !== null){
			// found entry: add keyword token
			this.token_add({type: keyword});

		} else {
			// didn't find entry: add as identifier
			this.token_add({type: Juicescript.token_type.IDENTIFIER, value: identifier});
		}
	}

	/*
		SCANNER: Handle flag

		Expects the leading `:` to be consumed already. Adds a FLAG token whose
		value is the name without colons, or reports an error if there is no name.
	*/
	scan_flag(){
		// GET FLAG NAME //
		// consume all valid characters
		while(this.is_alphanumeric(this.peek())) this.next();

		// get consumed string without the leading `:`
		const flag = this.source.substring(this.start + 1, this.end);

		// consume (=ignore) optional `:` suffix
		this.match(":");


		// CHECK IF THERE EVEN IS A NAME //
		if(flag.length <= 0){
			// ignore with error
			this.error_unexpected_char(this.source.charAt(this.start));
			return;
		}


		// ADD TOKEN //
		this.token_add({type: Juicescript.token_type.FLAG, value: flag});
	}

	/*
		HELPER: Consume next character from source

		Stores it in `this.char` (empty string behind the end of source).
	*/
	next(){
		this.char = this.source.charAt(this.end++);
	}

	/*
		HELPER: Return OFFSET next character from source

		Doesn't consume anything. OFFSET 0 is the next unread character,
		negative offsets look back. Returns an empty string outside the source.
	*/
	peek(offset = 0){
		return this.source.charAt(this.end + offset);
	}

	/*
		HELPER: Consume (and return true) if OFFSET next character from source matches NEEDLE

		On a match everything up to and including the matched character is
		consumed, otherwise the position stays untouched and false is returned.
	*/
	match(needle, offset = 0){
		// ignore if it doesn't match
		if(this.peek(offset) !== needle) return false;

		// consume if it matches
		this.end += offset;
		this.next();
		return true;
	}

	/*
		HELPER: Return if we are at end of source
	*/
	is_at_end(){
		return this.end >= this.source.length;
	}

	/*
		HELPER: Is CHAR a digit?
	*/
	is_digit(char){
		return (char >= "0" && char <= "9");
	}

	/*
		HELPER: Is CHAR a binary digit?
	*/
	is_binary(char){
		return (char === "0" || char === "1");
	}

	/*
		HELPER: Is CHAR an octal digit?
	*/
	is_octal(char){
		return (char >= "0" && char <= "7");
	}

	/*
		HELPER: Is CHAR a hexadecimal digit?
	*/
	is_hexadecimal(char){
		return	(char >= "0" && char <= "9") ||
				(char >= "a" && char <= "f") ||
				(char >= "A" && char <= "F");
	}

	/*
		HELPER: Is CHAR a char from a-z or A-Z?
	*/
	is_alpha(char){
		return	(char >= "a" && char <= "z") ||
				(char >= "A" && char <= "Z");
	}

	/*
		HELPER: Is CHAR in a-z, A-Z, 0-9, -, _?
	*/
	is_alphanumeric(char){
		return (this.is_alpha(char) || this.is_digit(char) || char === "-" || char === "_");
	}

	/*
		HELPER: Add new token object with OPTIONS

		OPTIONS.type is required and has to be known to Juicescript.token_type.
		OPTIONS.line falls back to the current line, OPTIONS.lexeme to the
		source text between `start` and `end`, OPTIONS.value to null.
	*/
	token_add(options){
		// NEW OBJECT //
		const token = {};


		// COLLECT REQUIRED ATTRIBUTES //
		// type
		token.type = options.type ?? null;
		if(!Juicescript.token_type.has(token.type)){
			// NOTE(review): throws a plain string, kept as-is because code
			// catching it may rely on that - consider an Error object
			throw "invalid token type '" + token.type + "'";
		}


		// COLLECT ATTRIBUTES WITH POSSIBLE FALLBACK VALUES //
		// line
		token.line = options.line ?? this.line;

		// lexeme
		token.lexeme = options.lexeme ?? this.source.substring(this.start, this.end);


		// OPTIONAL ATTRIBUTES //
		token.value = options.value ?? null;


		// ADD TO LIST //
		this.token_list.push(token);
	}

	/*
		HELPER: Report unexpected CHAR as error
	*/
	error_unexpected_char(char){
		this.error("unexpected character '" + char + "'");
	}

	/*
		HELPER: Return copy of ADDITIONAL with the current line as default for `line`

		Works on a copy, so the object handed in by the caller is never
		modified (and may be frozen or missing).
	*/
	with_line(additional){
		const info = {...additional};
		info.line ??= this.line;
		return info;
	}

	/*
		HELPER: Automagically keep track of problems and add additional info to stderr

		All four forward TEXT and ADDITIONAL (plus the current line, unless
		ADDITIONAL brings its own) to the stderr method of the same name.
		warning() and error() also increase their respective counter.
	*/
	debug(text, additional){
		this.io.stderr.debug(text, this.with_line(additional));
	}
	info(text, additional){
		this.io.stderr.info(text, this.with_line(additional));
	}
	warning(text, additional){
		// keep track of problem
		this.warning_count++;

		// print message
		this.io.stderr.warning(text, this.with_line(additional));
	}
	error(text, additional){
		// keep track of problem
		this.error_count++;

		// print message
		this.io.stderr.error(text, this.with_line(additional));
	}
}