前言

看了一会儿第二章的理论知识,感觉跟学校之前上课学的差不多,所以看起来顺了很多,幸好曾经选了编译原理

这个PA2的实验就是针对第二章——词法分析的

做实验的难度在于

  1. 需要对COOL语言熟悉
  2. 需要对flex所熟悉

关于COOL编程语言,查看手册(重点关注10、11章节):

https://web.stanford.edu/class/cs143/materials/cool-manual.pdf

关于flex的话,教程也很多,手册:

http://westes.github.io/flex/manual/

How to start

那么本实验该如何下手呢?

词法分析

首先得理解词法分析的作用是什么,输入应该是什么,输出应该是什么

总结来说,词法分析的工作过程如下所示:

image.png

词法分析器会对输入的字符串进行切割和分类,给子字符串(substring)赋予标记类(class),然后将<class, substring>的序列对传给语法分析器,其中,<class, substring>就称为token(词法单元)

例如,"foo = 42"的处理结果即为:

  • <Identifier, “foo”>
  • <Operator, “=”>
  • <Integer, “42”>

接下来以example目录下的hello_world.cl为例,运行bin目录下的词法分析器lexer,看看正确的词法分析器的输出应该是啥样的:

image.png

再来看看hello_world.cl的代码:

1
2
3
4
5
class Main inherits IO {
main(): SELF_TYPE {
out_string("Hello, World.\n")
};
};

大概有点感觉了?

flex

例子

这有个简单的flex例子:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*flex-wc.l*/
%option noyywrap
%{
int chars = 0;
int words = 0;
int lines = 0;
%}
%%
[a-zA-z]+ { words++; chars += strlen(yytext); }
\n { lines++; chars++; }
. { chars++; }
%%
int main() {
yylex();
printf("%8d %8d %8d\n", lines, words, chars);
return 0;
}

其实就是写正则表达式,匹配相应的集合并做处理

在编写flex文件前,还得熟悉几个变量

变量

  • cool_yylval,该变量有如下属性:
    • error_msg(报错)
    • symbol(符号表中变量)
    • boolean(布尔变量的值)
  • idtable:Identifier标识符表
  • inttable:整数表
  • stringtable:字符串表
  • curr_lineno:行号

校验

当成功编写完flex代码,如何检查?

官方提供了自查脚本:

1
2
wget https://courses.edx.org/assets/courseware/v1/2aa4dec0c84ec3a8d91e0c1d8814452b/asset-v1:StanfordOnline+SOE.YCSCS1+1T2020+type@asset+block/pa1-grading.pl
perl ./pa1-grading.pl

grading目录下有相应的测试所用COOL代码,可以拿来运行,看看跟bin目录下lexer的运行结果的差别在哪里,从而改进

flex代码

编写的过程遇到的一个难点就是各种特殊字符的考虑,如双引号、转义字符等,难免会有遗漏或考虑不周的地方

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/*
* The scanner definition for COOL.
*/

/*
* Stuff enclosed in %{ %} in the first section is copied verbatim to the
* output, so headers and global definitions are placed here to be visible
* to the code in the file. Don't remove anything that was here initially
*/
%{
#include <cool-parse.h>
#include <stringtab.h>
#include <utilities.h>

/* The compiler assumes these identifiers. */
#define yylval cool_yylval
#define yylex cool_yylex

/* Max size of string constants */
#define MAX_STR_CONST 1025
#define YY_NO_UNPUT /* keep g++ happy */

extern FILE *fin; /* we read from this file */

/* define YY_INPUT so we read from the FILE fin:
* This change makes it possible to use this scanner in
* the Cool compiler.
*/
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) < 0) \
YY_FATAL_ERROR( "read() in flex scanner failed");

char string_buf[MAX_STR_CONST]; /* to assemble string constants */
char *string_buf_ptr;

extern int curr_lineno;
extern int verbose_flag;

extern YYSTYPE cool_yylval;

/*
* Add Your own definitions here
*/

char str_const[MAX_STR_CONST];
int str_const_len;
int comments_num = 0;
bool null_flag;
bool long_flag;
%}

%option noyywrap
%x STRING COMMENTS LINE_COMMENTS
/*
* Define names for regular expressions here.
*/

DARROW =>
/* ASSIGN <- */

%%

/*
* Nested comments
*/

"(*" {
comments_num++;
BEGIN COMMENTS;
}
"--" {BEGIN LINE_COMMENTS;}
"*)" {
cool_yylval.error_msg = "Unmatched *)";
return ERROR;
}


<COMMENTS><<EOF>> {
cool_yylval.error_msg = "EOF in comment";
BEGIN 0;
return ERROR;
}

<COMMENTS>"\n" {curr_lineno++;}
<COMMENTS>\\. {}
<COMMENTS>"*)" {
comments_num--;
if (comments_num == 0){BEGIN 0;}
}
<COMMENTS>"(*" {
if (comments_num >= 0x7fffffff){
cool_yylval.error_msg = "Too many comments";
return ERROR;
}
comments_num++;
}
<COMMENTS>. {}

<LINE_COMMENTS>"\n" {
curr_lineno++;
BEGIN 0;
}
<LINE_COMMENTS>. {}

"\n" {curr_lineno++;} /*换行符,记录行数+1*/
[ \t\r\v\f]+ {} /*跳过whitespace空白*/

/*
* The multiple-character operators.
*/

{DARROW} { return (DARROW); }
"<-" { return (ASSIGN); }
"<=" { return (LE); }

/* 单字节的运算符
* The single-character operators.
*/
"{" {return '{';}
"}" {return '}';}
":" {return ':';}
"(" {return '(';}
")" {return ')';}
";" {return ';';}
"=" {return '=';}
"<" {return '<';}
"." {return '.';}
"," {return ',';}
"~" {return '~';}
"-" {return '-';}
"+" {return '+';}
"*" {return '*';}
"/" {return '/';}
"@" {return '@';}

/* 关键字
* Keywords are case-insensitive except for the values true and false,
* which must begin with a lower-case letter.
*/

/*
#define CLASS 258
#define ELSE 259
#define FI 260
#define IF 261
#define IN 262
#define INHERITS 263
#define LET 264
#define LOOP 265
#define POOL 266
#define THEN 267
#define WHILE 268
#define CASE 269
#define ESAC 270
#define OF 271
#define DARROW 272
#define NEW 273
#define ISVOID 274
#define STR_CONST 275
#define INT_CONST 276
#define BOOL_CONST 277
#define TYPEID 278
#define OBJECTID 279
#define ASSIGN 280
#define NOT 281
#define LE 282
#define ERROR 283
#define LET_STMT 285
*/
(?i:CLASS) {return CLASS;}
(?i:ELSE) {return ELSE;}
(?i:FI) {return FI;}
(?i:IF) {return IF;}
(?i:IN) {return IN;}
(?i:INHERITS) {return INHERITS;}
(?i:LET) {return LET;}
(?i:LOOP) {return LOOP;}
(?i:POOL) {return POOL;}
(?i:THEN) {return THEN;}
(?i:WHILE) {return WHILE;}
(?i:CASE) {return CASE;}
(?i:ESAC) {return ESAC;}
(?i:OF) {return OF;}
(?i:NEW) {return NEW;}
(?i:ISVOID) {return ISVOID;}
(?i:NOT) {return NOT;}

t(?i:rue) {
cool_yylval.boolean = true;
return BOOL_CONST;
}

f(?i:alse) {
cool_yylval.boolean = false;
return BOOL_CONST;
}
/*s
* String constants (C syntax)
* Escape sequence \c is accepted for all characters c. Except for
* \n \t \b \f, the result is c.
*
*/
\" {
memset(str_const, 0, sizeof(str_const));
str_const_len = 0;
null_flag = false;
long_flag = false;
BEGIN STRING;
}

<STRING><<EOF>> {
cool_yylval.error_msg = "EOF String!";
BEGIN 0;
return ERROR;
}

<STRING>\0 {
null_flag = true;
}

<STRING>\n {
curr_lineno++;
cool_yylval.error_msg = "Unterminated string constant";
BEGIN 0;
return ERROR;
}

<STRING>\\\n {
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}
else{
str_const[str_const_len++] = '\n';
curr_lineno++;
}
}

<STRING>\\. {
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}else{
switch(yytext[1]){
case '\"': str_const[str_const_len++] = '\"';break;
case '\\': str_const[str_const_len++] = '\\';break;
case 'b' : str_const[str_const_len++] = '\b';break;
case 'f' : str_const[str_const_len++] = '\f';break;
case 't' : str_const[str_const_len++] = '\t';break;
case 'n' : str_const[str_const_len++] = '\n';break;
case 0 : str_const[str_const_len++] = 0;null_flag = true;break;
default : str_const[str_const_len++] = yytext[1];
}
}

}

<STRING>\" {
BEGIN 0;
if (null_flag){
cool_yylval.error_msg = "String contains escaped null character";
return ERROR;
}
if (long_flag){
cool_yylval.error_msg = "String constant too long";
return ERROR;
}
cool_yylval.symbol = stringtable.add_string(str_const);
return STR_CONST;
}


<STRING>. {
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}else{
str_const[str_const_len++] = yytext[0];
}
}

[0-9]+ {
cool_yylval.symbol = inttable.add_string(yytext);
return INT_CONST;
}


[A-Z][A-Za-z0-9_]* {
cool_yylval.symbol = idtable.add_string(yytext);
return TYPEID;
}


[a-z][A-Za-z0-9_]* {
cool_yylval.symbol = idtable.add_string(yytext);
return OBJECTID;
}

. {
cool_yylval.error_msg = yytext;
return ERROR;
}

%%

注释(comments)

首先是注释的处理部分

我们关注COOL的手册:

image.png

COOL的注释有()和—两种形式,分别为多行注释和单行注释

flex代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"(*"    {
comments_num++;
BEGIN COMMENTS;
} /* 块注释开始 */
"--" {BEGIN LINE_COMMENTS;} /* 单行注释开始 */
"*)" {
cool_yylval.error_msg = "Unmatched *)";
return ERROR;
}


<COMMENTS><<EOF>> {
cool_yylval.error_msg = "EOF in comment";
BEGIN 0;
return ERROR;
}

<COMMENTS>"\n" {curr_lineno++;} /* 注释中遇到换行 */
<COMMENTS>\\. {}
<COMMENTS>"*)" { /* 注释中遇到嵌套注释 */
comments_num--;
if (comments_num == 0){BEGIN 0;}
}
<COMMENTS>"(*" { /* 块注释结束标志 */
if (comments_num >= 0x7fffffff){
cool_yylval.error_msg = "Too many comments";
return ERROR;
}
comments_num++;
}
<COMMENTS>. {}

<LINE_COMMENTS>"\n" { /* 单行注释结束 */
curr_lineno++;
BEGIN 0;
}
<LINE_COMMENTS>. {}

空白(whitespace)

针对空白的处理如下,跳过空格、制表符、换页符等等

遇到换行符时,记录行号+1

1
2
"\n"            {curr_lineno++;} /*换行符,记录行数+1*/
[ \t\r\v\f]+ {} /*跳过whitespace空白*/

操作符(operator)

操作符有多个字节的操作符和单个字节的操作符,注意这里遵循最长匹配原则,长的操作符在前

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
DARROW          =>

......

{DARROW} { return (DARROW); }
"<-" { return (ASSIGN); }
"<=" { return (LE); }

/* 单字节的运算符
* The single-character operators.
*/
"{" {return '{';}
"}" {return '}';}
":" {return ':';}
"(" {return '(';}
")" {return ')';}
";" {return ';';}
"=" {return '=';}
"<" {return '<';}
"." {return '.';}
"," {return ',';}
"~" {return '~';}
"-" {return '-';}
"+" {return '+';}
"*" {return '*';}
"/" {return '/';}
"@" {return '@';}

关键字(keywords)

这里是匹配一些关键字,关键字在include/cool-parse.h下有定义

注意,(?i:string)的写法是匹配string且忽略大小写

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
 /* 关键字
* Keywords are case-insensitive except for the values true and false,
* which must begin with a lower-case letter.
*/

/*
#define CLASS 258
#define ELSE 259
#define FI 260
#define IF 261
#define IN 262
#define INHERITS 263
#define LET 264
#define LOOP 265
#define POOL 266
#define THEN 267
#define WHILE 268
#define CASE 269
#define ESAC 270
#define OF 271
#define DARROW 272
#define NEW 273
#define ISVOID 274
#define STR_CONST 275
#define INT_CONST 276
#define BOOL_CONST 277
#define TYPEID 278
#define OBJECTID 279
#define ASSIGN 280
#define NOT 281
#define LE 282
#define ERROR 283
#define LET_STMT 285
*/
(?i:CLASS) {return CLASS;}
(?i:ELSE) {return ELSE;}
(?i:FI) {return FI;}
(?i:IF) {return IF;}
(?i:IN) {return IN;}
(?i:INHERITS) {return INHERITS;}
(?i:LET) {return LET;}
(?i:LOOP) {return LOOP;}
(?i:POOL) {return POOL;}
(?i:THEN) {return THEN;}
(?i:WHILE) {return WHILE;}
(?i:CASE) {return CASE;}
(?i:ESAC) {return ESAC;}
(?i:OF) {return OF;}
(?i:NEW) {return NEW;}
(?i:ISVOID) {return ISVOID;}
(?i:NOT) {return NOT;}

标识符(Identifier)和常量

查看COOL手册中对bool常量的说明:

image.png

true和false的第一个字符须为小写,后三个字符可为大写也可为小写

再来就是,大写字符开头的变量识别为TYPEID,小写字符开头的变量识别为OBJECTID

这里比较复杂的是对于字符串的识别,要考虑到空字符、转义、引号、长度、换行等等等等

再加上对数字常量的识别,这个好写

当然最后别忘了错误处理,除上面所有外其他形式的字符

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
t(?i:rue) {
cool_yylval.boolean = true;
return BOOL_CONST;
}

f(?i:alse) {
cool_yylval.boolean = false;
return BOOL_CONST;
}

\" { /* 字符串常量开始 */
memset(str_const, 0, sizeof(str_const));
str_const_len = 0;
null_flag = false;
long_flag = false;
BEGIN STRING;
}

<STRING><<EOF>> {
cool_yylval.error_msg = "EOF String!";
BEGIN 0;
return ERROR;
}

<STRING>\0 { /* 字符串识别中遇到空字节 */
null_flag = true;
}

<STRING>\n { /* 字符串识别中遇到换行 */
curr_lineno++;
cool_yylval.error_msg = "Unterminated string constant";
BEGIN 0;
return ERROR;
}

<STRING>\\\n { /* 字符串识别中遇到加\的换行 */
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}
else{
str_const[str_const_len++] = '\n';
curr_lineno++;
}
}

<STRING>\\. { /* 字符串识别中遇到转义字符\且后面还有字符 */
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}else{
switch(yytext[1]){
case '\"': str_const[str_const_len++] = '\"';break;
case '\\': str_const[str_const_len++] = '\\';break;
case 'b' : str_const[str_const_len++] = '\b';break;
case 'f' : str_const[str_const_len++] = '\f';break;
case 't' : str_const[str_const_len++] = '\t';break;
case 'n' : str_const[str_const_len++] = '\n';break;
case 0 : str_const[str_const_len++] = 0;null_flag = true;break;
default : str_const[str_const_len++] = yytext[1];
}
}

}

<STRING>\" { /* 字符串识别中遇到引号,识别结束 */
BEGIN 0;
if (null_flag){
cool_yylval.error_msg = "String contains escaped null character";
return ERROR;
}
if (long_flag){
cool_yylval.error_msg = "String constant too long";
return ERROR;
}
cool_yylval.symbol = stringtable.add_string(str_const);
return STR_CONST;
}


<STRING>. { /* 字符串识别中遇到任意字符 */
if (str_const_len >= MAX_STR_CONST - 1){
long_flag = true;
}else{
str_const[str_const_len++] = yytext[0];
}
}

[0-9]+ { /* 整数常量 */
cool_yylval.symbol = inttable.add_string(yytext);
return INT_CONST;
}


[A-Z][A-Za-z0-9_]* { /* TYPEID */
cool_yylval.symbol = idtable.add_string(yytext);
return TYPEID;
}


[a-z][A-Za-z0-9_]* { /* OBJECTID */
cool_yylval.symbol = idtable.add_string(yytext);
return OBJECTID;
}

. { /* 其他字符 */
cool_yylval.error_msg = yytext;
return ERROR;
}

%%

参考

https://blog.csdn.net/CrazyHeroZK/article/details/87359818

http://doraemonzzz.com/2021/03/21/Stanford%20Compiler%20PA2

https://github.com/afterthat97/cool-compiler/

https://github.com/skyzluo/CS143-Compilers-Stanford/