词法分析器：一个例子

Lộc Nguyễn

4.71/5 (11投票s)

2014年10月27日

CPOL

3分钟阅读

191829

5940

一个扫描源文件以获取令牌的 C 程序

下载源代码

→ 在阅读完本文后，您可能想看看语法分析：一个例子。

引言

词法分析器（或扫描器）是一个从输入源文件（或源代码）中识别标记（也称为符号）的程序。每个标记都是一个有意义的字符串，例如数字、运算符或标识符。

这是作业：编写一个遵循以下词法规则的扫描器

不区分大小写
所有英文字母（大小写）、数字和以下所示的额外字符，以及空格
标识符
- 以字母开头，并以任意数量的字母继续
- 假设没有标识符的长度超过 8 个字符
关键字（保留字），包括：start finish then if repeat var int float do read print void return dummy program
关系运算符，包括： == < > =!= => =<
其他运算符，包括： = : + - * / %
分隔符，包括： . ( ) , { } ; [ ]
数字
- 任何十进制数字序列，没有符号
- 假设没有数字的长度超过 8 位
其他假设
- 除了改变标记的情况（例如 x y 与 xy），不需要空格来分隔标记
- 注释以 // 开头，以换行符结尾

显示找到的标记，每个标记有 3 个字段：标记类型、标记实例和行号。

其他要求

程序的调用方式是
scanner [文件名]
其中输入源文件是 filename.lan (扩展名 .lan 是隐式的)。
如果没有指定文件名，程序将通过从文件重定向从键盘读取相同的数据。例如，scanner < filename.lan
执行必要的验证。例如，如果发现无效符号，则显示带有行号的错误消息，然后中止程序

准备工作

计划

这是 Makefile，也显示了程序文件的组织结构

CC	= gcc
PROG	= scanner 

$(PROG): driveScanner.o scanner.o
	$(CC) -o $(PROG) driveScanner.o scanner.o

driveScanner.o : driveScanner.c token.h scanner.h
	$(CC) -c driveScanner.c

scanner.o : scanner.c token.h scanner.h symdef.h
	$(CC) -c scanner.c

.PHONY: clean

clean:
	/usr/bin/rm -rf core $(PROG) *.o

token.h 定义了 MAX (= 8，每个单词字符串的最大长度)，LIMIT (= 200，输入源文件中单词字符串的最大数量)，并声明了标记类型（稍后显示）。

处理命令行参数和重定向

从 driveScanner.c 中的 main() 开始

int main(int argc, char *argv[]) {
    FILE *filePtr;

    switch (argc) {
        case 1: // No parameters, use stdin
                // printf("NO argument provided\n");
                filePtr = stdin;
                break;

        case 2: // One parameter, use .lan file supplied	
                if ( (filePtr = fopen(strcat(argv[1], ".lan"), "r")) == NULL ) {
                        printf("Cannot open input file.\n");
                        exit(1);
                }
                break;

        default:
                printf("Syntax: scanner [file] (.lan is implicit)\n");
                exit(0);
    }
}

接下来检查输入文件是否为空

fseek(filePtr, 0, SEEK_END);
if (ftell(filePtr) == 0) {
        printf("File is empty.\n");
        exit(1);
} else {
        rewind(filePtr);
}

现在检查无效字符和最大单词长度

char c;
int numLine = 1;

int charCount = 0;
char tempStr[MAX]; // ! Remember: C doesn't do array out-of-bound checking! 
int i;

int isValid = 1; // true 
while ((c = fgetc(filePtr)) != EOF) {
        if (c == '\n')
                numLine++;

        // Exclude comment line starting with '//' and ending with new line
        if (c == '/') {
                if (fgetc(filePtr) == '/') {
                        while ((c = fgetc(filePtr)) != '\n') {}
                        numLine++;
                }			
        }

        if (isalnum(c)) {
                tempStr[charCount] = c; 
                charCount++;
                if (charCount > MAX) {
                        printf("Word '%s' on line number %d is too long. \n", tempStr, numLine);
                        exit(1);	
                }
        } else if (isspace(c) || isExAcceptableChar(c)) { 
                charCount = 0;
        } else {
                printf("Invalid character '%c' at line %d. \n", c, numLine);
                isValid = 0; // false
        }

}

if (isValid == 0) { 
        printf("Something wrong with the input file. \n");
        exit(1);
}

rewind(filePtr);

此时，文件是好的。现在让 scanner.c 完成工作

TokenType tokenType = UNDEF;

while ((tokenType = getTokenType(filePtr)) != EOT) {}

fclose(filePtr); 
return 0; // done main and program

让我们看一下 token.h 中的类型声明

typedef enum {
	IDENTIFIER,
	KEYWORD,
	NUMBER,
	REL_OP, 	// such as ==  <  >  =!=    =>  =<
	OP,			// such as = :  +  -  *  / %
	DELIM,		// such as . (  ) , { } ; [ ] 

	UNDEF,		// undefined
	EOT 		// end of token

} TokenType;

typedef struct {
	TokenType tokenType;
	char *instance;
	int lineNum;

} Token;

请注意，我尚未按照预期使用 TokeType 枚举和 Token 结构体。在处理此作业时，首先我声明并打算将 Token 结构体用作“三元组”数据类型（TokenType 枚举、实例和行号）。但后来这些变得不再需要，如下所示。

此时，其余的工作现在是扫描器的责任。除了 token.h 之外，我创建了另一个头文件来支持 scanner.c 的工作。

symdef.h 根据作业要求定义以下内容，以及扫描器使用的其他几个局部数组来存储找到的标记

char *keywords[] = {"start", "finish", "then", "if", "repeat", "var", 
	"int", "float", "do", "read", "print", "void", "return", "dummy", "program"};	

char *relationalOperators[] = {"==", "<", ">", "=!=", "=>", "=<"};

char otherOperators[] = {':', '+', '-', '*', '/', '%'};

char delimiters[] = {'.', '(', ')', ',', '{', '}', ';', '[', ']'};

char words[LIMIT][MAX]; // include identifiers, and keywords
int wordi = 0, wordj = 0;
int wordLineNums[LIMIT];

char keys[LIMIT][MAX]; // to store keywords
int keyi = 0;
int keyLineNums[LIMIT];
   
char idens[LIMIT][MAX]; // to store identifiers
int ideni = 0;
int idenLineNums[LIMIT];

char nums[LIMIT][MAX];  // to store numbers
int numi = 0, numj = 0;
int numLineNums[LIMIT];

char delims[LIMIT]; // to store delimiters
int delimi = 0;
int delimLineNums[LIMIT];

char otherOps[LIMIT]; // to store other operators
int otherOpi = 0;
int otherOpLineNums[LIMIT];

char relOps[LIMIT][MAX]; // to store keywords
int relOpi = 0, relOpj = 0;
int relOpLineNums[LIMIT];

scanner.c

除了英文字母和数字，这些是额外的可接受字符

int isExAcceptableChar(char c) {
	if (c == '.' || c == '(' || c == ')' || c == ',' || c =='{' || c == '}' ||
		c == ';' || c == '[' || c == ']' || 
		c == ':' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || 
		c == '=' || c == '<' || c == '>' || c == '!') {

		return 1;
	} else 
		return 0;
}

scanner.c 的关键函数如下

TokenType getTokenType(FILE *filePtr) {
	int lineNum = 1;	
	char ch;
	
	while ((ch = fgetc(filePtr)) != EOF) {
		if (ch == '\n') {
			lineNum++;
		}

		// Ignore comment starting with // to the end of line
		if (ch == '/') {
			if (fgetc(filePtr) == '/') {
				while ((ch = fgetc(filePtr)) != '\n') {}
				lineNum++;
			} else
				fseek(filePtr, -1, SEEK_CUR);
		}

		if (isalpha(ch)) {
			words[wordi][wordj++] = ch;	
			while (isalpha(ch = fgetc(filePtr))) {
				words[wordi][wordj++] = ch;	
			}
			words[wordi][wordj] = '\0';	
			wordLineNums[wordi] = lineNum;
			wordi++; wordj = 0;	
			fseek(filePtr, -1, SEEK_CUR);
		} 

		else if (isdigit(ch)) {
			nums[numi][numj++] = ch;
			while (isdigit(ch = fgetc(filePtr))) {
				nums[numi][numj++] = ch;
			}
			nums[numi][numj] = '\0';
			numLineNums[numi] = lineNum;
			numi++; numj = 0;
			fseek(filePtr, -1, SEEK_CUR);
		}

		else if (ispunct(ch)) {
			if (isDelimiter(ch)) {
				delims[delimi] = ch;
				delimLineNums[delimi] = lineNum;
				delimi++;
			} 
			else if (isOtherOperators(ch)) {
				otherOps[otherOpi] = ch;
				otherOpLineNums[otherOpi] = lineNum;
				otherOpi++;
			}
			else if (isStartRelationalOperator(ch)) {
				if (ch == '<' || ch == '>') {
					relOps[relOpi][relOpj++] = ch;
					relOps[relOpi][relOpj] = '\0';
					relOpLineNums[relOpi] = lineNum;
					relOpi++; relOpj = 0;
				}
				else if (ch == '=') {
					if ((ch = fgetc(filePtr)) == '=' || ch == '>' || ch == '<') {
						relOps[relOpi][relOpj++] = '=';	
						relOps[relOpi][relOpj++] = ch;	
						relOps[relOpi][relOpj] = '\0';
						relOpLineNums[relOpi] = lineNum;
						relOpi++; relOpj = 0;
					} else if (ch == '!') {
						if ((ch = fgetc(filePtr)) == '=') {
							relOps[relOpi][relOpj++] = '=';
							relOps[relOpi][relOpj++] = '!';
							relOps[relOpi][relOpj++] = ch;	
							relOps[relOpi][relOpj] = '\0';
							relOpLineNums[relOpi] = lineNum;
							relOpi++; relOpj = 0;
						} else {
							fseek(filePtr, -1, SEEK_CUR);
						}
					} else {
						fseek(filePtr, -1, SEEK_CUR);
					}
			
				}
			}
		}
	} // end while

	splitWords();	

	printSummary();
	
	return EOT; // end of token
}

注意
- 想法是进行“超前一个字符”以决定下一个标记是什么。
- fseek(filePtr, -1, SEEK_CUR) 用于在需要时回退一个字符。
- 二维数组 words 用于“收集”关键字和标识符标记。然后将此二维数组（字符数组，或等效的字符串一维数组）拆分为关键字和标识符标记数组。
- 通过“一次一个字符”前进和后退来找到运算符和数字标记。
- 这些是在输入源文件中来回移动时有帮助的一些检查函数

int isStartRelationalOperator(char c) {
	int i;
	int result = 0; // false
	if (c == '=' || c == '<' || c == '>') {
		result = 1;
	}
	return result;	
}

int isOtherOperators(char c) {
	 int i;
	 int result = 0; // false
	 for (i = 0; i < 6; i++) {
		if (otherOperators[i] == c) 
			result = 1;
	 }
	 return result;
}

int isDelimiter(char c) {
	 int i;
	 int result = 0; // false
	 for (i = 0; i < 9; i++) {
		if (delimiters[i] == c) 
			result = 1;
	 }
	 return result;
}

int isKeyword(char *str) {
	int i;
	int result = 0; // false
	for (i = 0; i < 15; i++) {
		if (!strcasecmp(keywords[i], str))
			result = 1;
	}
	return result;
}

注意：数字 6、9、15（预先指定的其他运算符、分隔符和关键字的数量）被硬编码，这不应该。它们应该以类似于 token.h 中 MAX 或 LIMIT 的方式定义。

现在查看 words 数组，并为关键字和标识符标记数组（keys 数组和 idens 数组）选择标记

void splitWords() {
	int i;
	for (i = 0; i < wordi; i++) {
		if (isKeyword(words[i])) {
			strcpy(keys[keyi], words[i]);
			keyLineNums[keyi] =  wordLineNums[i];
			keyi++;
		} else {
			strcpy(idens[ideni], words[i]);
			idenLineNums[ideni] = wordLineNums[i];
			ideni++;

		}
	}
}

最后一步是显示这些标记数组（nums、keys、idens、delims、relOps 和 otherOps）。我包含示例文本输入源文件 (file.lan) 如下

// first keywords separated by spaces then by delimiters then by operators
start finish then if repeat var int float do read print void return dummy program
start.finish(then)if,repeat{var}int;float[do]start==finishif=!=repeat=>var<=int=float:do!read+print-void*return/dummy%program

// now some identifiers and numbers separated by operators or delimiters or WS

x.xyz abc123 abc 123 -123

这是我的输出

→ 在阅读完本文后，您可能想看看语法分析：一个例子。