http://rosettacode.org/wiki/Compiler/lexical_analyzer#QB64
Lexical Analyzer
Definition from Wikipedia:
Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).
Task: Create a lexical analyzer for the simple programming language specified below. The program should read input from a file and/or stdin, and write output to a file and/or stdout. If the language being used has a lexer module/library/class, it would be great if two versions of the solution are provided: one without the lexer module, and one with.
This is lex.bas:
' Token-type labels for integer, identifier and string tokens.
' Kept on a single physical line: a QB64 statement cannot span several
' lines unless each break is marked with a trailing "_" continuation.
const c_integer = "Integer", c_ident = "Identifier", c_string = "String"
line_n = 1: col_n = 0: text_p = 1: the_ch = " "
case c_integer
, c_ident
, c_string: out_tok
= tok
print err_line
, err_col
, toktyp
, out_tok
print #1, err_line
, err_col
, toktyp
, out_tok
' get next tok, toktyp
toktyp = ""
restart: err_line = line_n: err_col = col_n: tok = the_ch
case "%":
call nextch: toktyp
= "Op_mod" case "(":
call nextch: toktyp
= "LeftParen" case ")":
call nextch: toktyp
= "RightParen" case "*":
call nextch: toktyp
= "Op_multiply" case "+":
call nextch: toktyp
= "Op_add" case "-":
call nextch: toktyp
= "Op_subtract" case ";":
call nextch: toktyp
= "Semicolon" case "{":
call nextch: toktyp
= "LeftBrace" case "}":
call nextch: toktyp
= "RightBrace"
case "&":
call follow
("&", "Op_and", "") case "!":
call follow
("=", "Op_notequal", "Op_not") case "<":
call follow
("=", "Op_lessequal", "Op_less") case "=":
call follow
("=", "Op_equal", "Op_assign") case ">":
call follow
("=", "Op_greaterequal", "Op_greater")
case "": toktyp
= "End_of_input"
tok = tok + the_ch
toktyp = if_both
toktyp = if_one
toktyp = c_string
tok = tok + the_ch
toktyp = c_integer
call error_exit
(err_line
, err_col
, "Empty character constant"):
exit sub
tok = "10"
tok = "92"
call error_exit
(line_n
, col_n
, "Unknown escape sequence:" + the_ch
):
exit sub
call error_exit
(line_n
, col_n
, "Multi-character constant"):
exit sub
toktyp = "Op_divide"
tok = ""
call error_exit
(line_n
, col_n
, "EOF in comment"):
exit sub
tok = tok + the_ch
case "else": toktyp
= "keyword_else" case "if": toktyp
= "keyword_if" case "print": toktyp
= "keyword_print" case "putc":: toktyp
= "keyword_putc" case "while": toktyp
= "keyword_while"
toktyp = c_integer
tok = tok + the_ch
call error_exit
(err_line
, err_col
, "Bogus number: " + tok
+ the_ch
):
exit sub
isalpha&
= c
<> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_", c
) > 0
isdigit&
= c
<> "" and instr("0123456789", c
) > 0
isalnum&
= c
<> "" and instr("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_", c
) > 0
' get next char - fold cr/lf into just lf
the_ch = ""
col_n = col_n + 1
the_ch
= mid$(source
, text_p
, 1) text_p = text_p + 1
text_p = text_p + 1
line_n = line_n + 1
col_n = 0
errors = -1
One of the samples provided, mandelp.t
{
/*
This
is an
integer ascii Mandelbrot generator
*/
left_edge = -420;
right_edge = 300;
top_edge = 300;
bottom_edge = -300;
x_step = 7;
y_step = 15;
max_iter = 200;
y0 = top_edge;
while (y0
> bottom_edge
) { x0 = left_edge;
while (x0
< right_edge
) { y = 0;
x = 0;
the_char = ' ';
i = 0;
x_x = (x * x) / 200;
y_y = (y * y) / 200;
the_char = '0' + i;
the_char = '@';
}
i = max_iter;
}
y = x * y / 100 + y0;
x = x_x - y_y + x0;
i = i + 1;
}
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
y0 = y0 - y_step;
}
}
And here is the output, from: lex mandelp.t
1 1 LeftBrace
5 5 Identifier left_edge
5 17 Op_assign
5 19 Op_subtract
5 23 Semicolon
6 5 Identifier right_edge
6 17 Op_assign
6 23 Semicolon
7 5 Identifier top_edge
7 17 Op_assign
7 23 Semicolon
8 5 Identifier bottom_edge
8 17 Op_assign
8 19 Op_subtract
8 23 Semicolon
9 5 Identifier x_step
9 17 Op_assign
9 23 Semicolon
10 5 Identifier y_step
10 17 Op_assign
10 23 Semicolon
12 5 Identifier max_iter
12 17 Op_assign
12 23 Semicolon
14 5 Identifier y0
14 8 Op_assign
14 10 Identifier top_edge
14 18 Semicolon
15 5 keyword_while
15 11 LeftParen
15 12 Identifier y0
15 15 Op_greater
15 17 Identifier bottom_edge
15 28 RightParen
15 30 LeftBrace
16 9 Identifier x0
16 12 Op_assign
16 14 Identifier left_edge
16 23 Semicolon
17 9 keyword_while
17 15 LeftParen
17 16 Identifier x0
17 19 Op_less
17 21 Identifier right_edge
17 31 RightParen
17 33 LeftBrace
18 13 Identifier y
18 15 Op_assign
18 18 Semicolon
19 13 Identifier x
19 15 Op_assign
19 18 Semicolon
20 13 Identifier the_char
20 22 Op_assign
20 27 Semicolon
21 13 Identifier i
21 15 Op_assign
21 18 Semicolon
22 13 keyword_while
22 19 LeftParen
22 20 Identifier i
22 22 Op_less
22 24 Identifier max_iter
22 32 RightParen
22 34 LeftBrace
23 17 Identifier x_x
23 21 Op_assign
23 23 LeftParen
23 24 Identifier x
23 26 Op_multiply
23 28 Identifier x
23 29 RightParen
23 31 Op_divide
23 36 Semicolon
24 17 Identifier y_y
24 21 Op_assign
24 23 LeftParen
24 24 Identifier y
24 26 Op_multiply
24 28 Identifier y
24 29 RightParen
24 31 Op_divide
24 36 Semicolon
25 17 keyword_if
25 20 LeftParen
25 21 Identifier x_x
25 25 Op_add
25 27 Identifier y_y
25 31 Op_greater
25 37 RightParen
25 39 LeftBrace
26 21 Identifier the_char
26 30 Op_assign
26 36 Op_add
26 38 Identifier i
26 39 Semicolon
27 21 keyword_if
27 24 LeftParen
27 25 Identifier i
27 27 Op_greater
27 30 RightParen
27 32 LeftBrace
28 25 Identifier the_char
28 34 Op_assign
28 39 Semicolon
29 21 RightBrace
30 21 Identifier i
30 23 Op_assign
30 25 Identifier max_iter
30 33 Semicolon
31 17 RightBrace
32 17 Identifier y
32 19 Op_assign
32 21 Identifier x
32 23 Op_multiply
32 25 Identifier y
32 27 Op_divide
32 33 Op_add
32 35 Identifier y0
32 37 Semicolon
33 17 Identifier x
33 19 Op_assign
33 21 Identifier x_x
33 25 Op_subtract
33 27 Identifier y_y
33 31 Op_add
33 33 Identifier x0
33 35 Semicolon
34 17 Identifier i
34 19 Op_assign
34 21 Identifier i
34 23 Op_add
34 26 Semicolon
35 13 RightBrace
36 13 keyword_putc
36 17 LeftParen
36 18 Identifier the_char
36 26 RightParen
36 27 Semicolon
37 13 Identifier x0
37 16 Op_assign
37 18 Identifier x0
37 21 Op_add
37 23 Identifier x_step
37 29 Semicolon
38 9 RightBrace
39 9 keyword_putc
39 13 LeftParen
39 18 RightParen
39 19 Semicolon
40 9 Identifier y0
40 12 Op_assign
40 14 Identifier y0
40 17 Op_subtract
40 19 Identifier y_step
40 25 Semicolon
41 5 RightBrace
42 1 RightBrace
43 1 End_of_input