手写词法分析器(三)
学习目标
通过本集的学习,你将能够:
- 学会处理空白字符
- 学会处理注释
- 学会处理字符串字面量
- 掌握错误处理
1. 处理空白字符
1.1 空白字符的种类
空白字符包括空格、制表符、换行符等,它们通常不影响程序的语义,只是用来格式化代码。
空白字符:
- 空格 (space)
- 制表符 (tab, \t)
- 换行符 (newline, \n)
- 回车符 (carriage return, \r)
- 换页符 (form feed, \f)1.2 跳过空白字符
def skip_whitespace(self):
while self.current_char is not None and self.current_char.isspace():
self.advance()1.3 在主循环中使用
def get_next_token(self):
while self.current_char is not None:
if self.current_char.isspace():
self.skip_whitespace()
continue
# ... 处理其他token2. 处理注释
2.1 注释的种类
单行注释:
// 这是单行注释
# 这也是单行注释
多行注释:
/* 这是
多行
注释 */
'''
这也是多行注释
'''2.2 单行注释处理
def skip_single_line_comment(self):
while self.current_char is not None and self.current_char != '\n':
self.advance()
# 如果有换行符,也跳过它
if self.current_char == '\n':
self.advance()2.3 多行注释处理
def skip_multi_line_comment(self):
# 假设注释以 /* 开始,以 */ 结束
self.advance() # 跳过 *
if self.current_char == '/':
self.advance() # 跳过 /
while self.current_char is not None:
if self.current_char == '*':
self.advance()
if self.current_char == '/':
self.advance()
return
else:
self.advance()
raise Exception('Unterminated multi-line comment')2.4 完整的注释处理
def skip_comment(self):
if self.current_char == '/':
self.advance()
if self.current_char == '/':
# 单行注释
self.skip_single_line_comment()
elif self.current_char == '*':
# 多行注释
self.skip_multi_line_comment()
else:
# 不是注释,回退一个字符
self.pos -= 1
self.current_char = self.source[self.pos]3. 处理字符串字面量
3.1 字符串的格式
字符串通常用单引号或双引号括起来。
字符串示例:
"Hello, World!"
'Hello, World!'
"带\"转义\"字符的字符串"3.2 字符串的状态机
状态转换图:
开始
│
▼
" 或 ' ──→ 字符串状态 ──┐
│ │
├─ \ ──→ 转义状态 ────┤
│ │
└─ " 或 ' ──→ 结束 │
│
└───────────────────┘3.3 代码实现
def string(self):
result = ''
quote_char = self.current_char # 记住是单引号还是双引号
self.advance()
while self.current_char is not None and self.current_char != quote_char:
if self.current_char == '\\':
# 处理转义字符
self.advance()
if self.current_char == 'n':
result += '\n'
elif self.current_char == 't':
result += '\t'
elif self.current_char == '\\':
result += '\\'
elif self.current_char == '\'':
result += '\''
elif self.current_char == '\"':
result += '\"'
else:
result += self.current_char
else:
result += self.current_char
self.advance()
if self.current_char != quote_char:
raise Exception('Unterminated string')
self.advance() # 跳过结束引号
return ('STRING', result)4. 错误处理
4.1 错误类型
常见的词法错误:
1. 无效字符
2. 未终止的字符串
3. 未终止的注释
4. 无效的数字格式4.2 错误报告
def error(self, message):
line, col = self.get_position()
raise Exception(f'{message} at line {line}, column {col}')
def get_position(self):
line = 1
col = 1
for i in range(self.pos):
if self.source[i] == '\n':
line += 1
col = 1
else:
col += 1
return line, col4.3 错误恢复
在词法分析中,遇到错误时我们可以尝试跳过错误字符继续分析。
def get_next_token(self):
while self.current_char is not None:
try:
if self.current_char.isspace():
self.skip_whitespace()
continue
elif self.current_char in '"\'':
return self.string()
elif self.current_char == '/' and (
self.peek() == '/' or self.peek() == '*'
):
self.skip_comment()
continue
elif self.current_char.isalpha() or self.current_char == '_':
return self.identifier()
elif self.current_char.isdigit():
return self.number()
elif self.current_char in '+-*/=<>!(){};,':
return self.operator_or_punctuator()
else:
self.error(f'Invalid character: {self.current_char}')
except Exception as e:
print(f'Warning: {e}')
# 跳过当前字符继续
self.advance()
return ('EOF', None)5. 完整的词法分析器
5.1 最终版本
class Lexer:
def __init__(self, source):
self.source = source
self.pos = 0
self.current_char = self.source[0] if source else None
self.KEYWORDS = {
'if': 'IF',
'else': 'ELSE',
'while': 'WHILE',
'for': 'FOR',
'return': 'RETURN'
}
def advance(self):
self.pos += 1
if self.pos < len(self.source):
self.current_char = self.source[self.pos]
else:
self.current_char = None
def peek(self):
if self.pos + 1 < len(self.source):
return self.source[self.pos + 1]
return None
def get_position(self):
line = 1
col = 1
for i in range(self.pos):
if self.source[i] == '\n':
line += 1
col = 1
else:
col += 1
return line, col
def error(self, message):
line, col = self.get_position()
raise Exception(f'{message} at line {line}, column {col}')
def skip_whitespace(self):
while self.current_char is not None and self.current_char.isspace():
self.advance()
def skip_single_line_comment(self):
while self.current_char is not None and self.current_char != '\n':
self.advance()
if self.current_char == '\n':
self.advance()
def skip_multi_line_comment(self):
self.advance()
if self.current_char == '*':
self.advance()
while self.current_char is not None:
if self.current_char == '*':
self.advance()
if self.current_char == '/':
self.advance()
return
else:
self.advance()
self.error('Unterminated multi-line comment')
def skip_comment(self):
self.advance()
if self.current_char == '/':
self.skip_single_line_comment()
elif self.current_char == '*':
self.skip_multi_line_comment()
else:
self.pos -= 1
self.current_char = self.source[self.pos]
def string(self):
result = ''
quote_char = self.current_char
self.advance()
while self.current_char is not None and self.current_char != quote_char:
if self.current_char == '\\':
self.advance()
if self.current_char == 'n':
result += '\n'
elif self.current_char == 't':
result += '\t'
elif self.current_char == '\\':
result += '\\'
elif self.current_char == '\'':
result += '\''
elif self.current_char == '\"':
result += '\"'
else:
result += self.current_char
else:
result += self.current_char
self.advance()
if self.current_char != quote_char:
self.error('Unterminated string')
self.advance()
return ('STRING', result)
def number(self):
result = ''
while self.current_char is not None and self.current_char.isdigit():
result += self.current_char
self.advance()
if self.current_char == '.':
result += '.'
self.advance()
while self.current_char is not None and self.current_char.isdigit():
result += self.current_char
self.advance()
return ('FLOAT', float(result))
else:
return ('INTEGER', int(result))
def identifier(self):
result = ''
while (self.current_char is not None and
(self.current_char.isalnum() or
self.current_char == '_')):
result += self.current_char
self.advance()
if result in self.KEYWORDS:
return (self.KEYWORDS[result], result)
else:
return ('IDENTIFIER', result)
def operator_or_punctuator(self):
char = self.current_char
self.advance()
if char == '=' and self.current_char == '=':
self.advance()
return ('OPERATOR', '==')
elif char == '!' and self.current_char == '=':
self.advance()
return ('OPERATOR', '!=')
elif char == '<' and self.current_char == '=':
self.advance()
return ('OPERATOR', '<=')
elif char == '>' and self.current_char == '=':
self.advance()
return ('OPERATOR', '>=')
elif char == '+':
if self.current_char == '=':
self.advance()
return ('OPERATOR', '+=')
else:
return ('OPERATOR', '+')
elif char == '-':
if self.current_char == '=':
self.advance()
return ('OPERATOR', '-=')
else:
return ('OPERATOR', '-')
elif char == '*':
if self.current_char == '=':
self.advance()
return ('OPERATOR', '*=')
else:
return ('OPERATOR', '*')
elif char == '/':
if self.current_char == '=':
self.advance()
return ('OPERATOR', '/=')
else:
return ('OPERATOR', '/')
elif char == '(':
return ('LPAREN', '(')
elif char == ')':
return ('RPAREN', ')')
elif char == '{':
return ('LBRACE', '{')
elif char == '}':
return ('RBRACE', '}')
elif char == ';':
return ('SEMICOLON', ';')
elif char == ',':
return ('COMMA', ',')
else:
return ('OPERATOR', char)
def get_next_token(self):
while self.current_char is not None:
if self.current_char.isspace():
self.skip_whitespace()
continue
elif self.current_char == '/' and self.peek() in '/':
self.skip_comment()
continue
elif self.current_char in '"\'':
return self.string()
elif self.current_char.isalpha() or self.current_char == '_':
return self.identifier()
elif self.current_char.isdigit():
return self.number()
elif self.current_char in '+-*/=<>!(){};,':
return self.operator_or_punctuator()
else:
self.error(f'Invalid character: {self.current_char}')
return ('EOF', None)5.2 测试完整词法分析器
# 测试代码
source = '''
// 这是一个测试程序
if (x == 10) {
/* 这是多行
注释 */
return "Hello, World!" + x;
}
'''
lexer = Lexer(source)
while True:
token = lexer.get_next_token()
print(token)
if token[0] == 'EOF':
break输出:
('IF', 'if')
('LPAREN', '(')
('IDENTIFIER', 'x')
('OPERATOR', '==')
('INTEGER', 10)
('RPAREN', ')')
('LBRACE', '{')
('RETURN', 'return')
('STRING', 'Hello, World!')
('OPERATOR', '+')
('IDENTIFIER', 'x')
('SEMICOLON', ';')
('RBRACE', '}')
('EOF', None)6. 实用案例
6.1 案例1:处理浮点数
def number(self):
result = ''
while self.current_char is not None and self.current_char.isdigit():
result += self.current_char
self.advance()
if self.current_char == '.':
result += '.'
self.advance()
while self.current_char is not None and self.current_char.isdigit():
result += self.current_char
self.advance()
return ('FLOAT', float(result))
else:
return ('INTEGER', int(result))6.2 案例2:更好的错误恢复
def get_next_token(self):
while self.current_char is not None:
try:
# 正常处理
if self.current_char.isspace():
self.skip_whitespace()
continue
elif self.current_char in '"\'':
return self.string()
elif self.current_char.isalpha() or self.current_char == '_':
return self.identifier()
elif self.current_char.isdigit():
return self.number()
elif self.current_char in '+-*/=<>!(){};,':
return self.operator_or_punctuator()
else:
self.error(f'Invalid character: {self.current_char}')
except Exception as e:
print(f'Lexical error: {e}')
# 跳过到下一个空白字符或分号
while (self.current_char is not None and
not self.current_char.isspace() and
self.current_char != ';'):
self.advance()
if self.current_char:
self.advance()
return ('EOF', None)7. 自测问题
- 如何处理空白字符?
- 如何处理单行注释和多行注释?
- 如何处理字符串字面量中的转义字符?
- 词法分析中常见的错误有哪些?
- 什么是错误恢复?
8. 下集预告
下一集我们将学习有限自动机入门!
参考资料
- 《编译原理》(龙书)
- 《现代编译原理》(虎书)