Take the following example:
$ cat hello.py
def hello():
    print("hello world")

if __name__ == "__main__":
    hello()
source character stream ==> lexical analyzer ==> token stream
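To get a rough feel for what that token stream looks like, the stdlib tokenize module (a pure-Python re-implementation of the C tokenizer discussed below) can be run over the same source. This is only an illustration, not the C code path itself:

import io
import tokenize

source = (
    'def hello():\n'
    '    print("hello world")\n'
    '\n'
    'if __name__ == "__main__":\n'
    '    hello()\n'
)

# Print each token's numeric type, symbolic name and text.
for tok in tokenize.generate_tokens(io.StringIO(source).readline):
    print(tok.type, tokenize.tok_name[tok.type], repr(tok.string))

The output starts with NAME 'def', NAME 'hello', OP '(', OP ')', OP ':', NEWLINE, INDENT and so on, which is roughly the granularity the C tokenizer produces.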
In CPython 3.10.2, lexical analysis is implemented mainly in Parser/tokenizer.c.
The central data structure is struct tok_state:
/* Tokenizer state */
struct tok_state {
    /* Input state; buf <= cur <= inp <= end */
    /* NB an entire line is held in the buffer */
    char *buf;          /* Input buffer, or NULL; malloc'ed if fp != NULL */
    char *cur;          /* Next character in buffer */
    char *inp;          /* End of data in buffer */
    int fp_interactive; /* If the file descriptor is interactive */
    char *interactive_src_start; /* The start of the source parsed so far in interactive mode */
    char *interactive_src_end;   /* The end of the source parsed so far in interactive mode */
    const char *end;    /* End of input buffer if buf != NULL */
    const char *start;  /* Start of current token if not NULL */
    int done;           /* E_OK normally, E_EOF at EOF, otherwise error code */
    /* NB If done != E_OK, cur must be == inp!!! */
    FILE *fp;           /* Rest of input; NULL if tokenizing a string */
    int tabsize;        /* Tab spacing */
    int indent;         /* Current indentation index */
    int indstack[MAXINDENT];    /* Stack of indents */
    int atbol;          /* Nonzero if at begin of new line */
    int pendin;         /* Pending indents (if > 0) or dedents (if < 0) */
    const char *prompt, *nextprompt; /* For interactive prompting */
    int lineno;         /* Current line number */
    int first_lineno;   /* First line of a single line or multi line string
                           expression (cf. issue 16806) */
    int level;          /* () [] {} Parentheses nesting level */
                        /* Used to allow free continuations inside them */
    char parenstack[MAXLEVEL];
    int parenlinenostack[MAXLEVEL];
    int parencolstack[MAXLEVEL];
    PyObject *filename;
    /* Stuff for checking on different tab sizes */
    int altindstack[MAXINDENT]; /* Stack of alternate indents */
    /* Stuff for PEP 0263 */
    enum decoding_state decoding_state;
    int decoding_erred;         /* whether erred in decoding */
    char *encoding;             /* Source encoding. */
    int cont_line;              /* whether we are in a continuation line. */
    const char* line_start;     /* pointer to start of current line */
    const char* multi_line_start; /* pointer to start of first line of
                                     a single line or multi line string
                                     expression (cf. issue 16806) */
    PyObject *decoding_readline; /* open(...).readline */
    PyObject *decoding_buffer;
    const char* enc;            /* Encoding for the current str. */
    char* str;                  /* Source string being tokenized (if tokenizing from a string) */
    char* input;                /* Tokenizer's newline translated copy of the string. */
    int type_comments;          /* Whether to look for type comments */
    /* async/await related fields (still needed depending on feature_version) */
    int async_hacks;            /* =1 if async/await aren't always keywords */
    int async_def;              /* =1 if tokens are inside an 'async def' body. */
    int async_def_indent;       /* Indentation level of the outermost 'async def'. */
    int async_def_nl;           /* =1 if the outermost 'async def' had at least one
                                   NEWLINE token after it. */
    /* How to proceed when asked for a new token in interactive mode */
    enum interactive_underflow_t interactive_underflow;
};
/* Create and initialize a new tok_state structure */
static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->type_comments = 0;
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
    return tok;
}
struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
                     const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            PyTokenizer_Free(tok);
            return NULL;
        }
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}
After initialization, PyTokenizer_FromFile allocates a buffer of BUFSIZ bytes (8 KB with glibc); the source code is read into this buffer for processing.
The entry function of the tokenizer:
int
PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int result = tok_get(tok, p_start, p_end);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}
Here tok_get is the function that actually performs the lexical analysis. It calls tok_nextc to fetch characters one at a time and groups them into tokens.
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            // Not taken on the first call: buf is still empty. Once some data has
            // been read into the buffer, characters are served from buf directly.
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            // Tokenizing a plain string: read characters from the string
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            // Interactive mode: read characters from the line typed at the prompt
            rc = tok_underflow_interactive(tok);
        }
        else {
            // Read from a file
            rc = tok_underflow_file(tok);
        }
        ...
        if (!rc) {
            // Nothing left to read
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;
    }
    Py_UNREACHABLE();
}
static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}
Because tok->decoding_state == STATE_INIT, the code first enters check_bom to see whether the file starts with a byte order mark. If the first bytes are 0xEF 0xBB 0xBF, the source encoding is recorded as utf-8. Either way, the decoding state advances to STATE_SEEK_CODING.
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.  */
static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    }
    else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
    }
    else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
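The effect of check_bom is easy to observe from Python itself (an illustrative sketch; the file name bom_hello.py is made up for this demo):

with open("bom_hello.py", "wb") as f:
    # The first three bytes are the UTF-8 BOM checked for above.
    f.write(b"\xef\xbb\xbf" + b'print("hello world")\n')

# Running `python3 bom_hello.py` still prints "hello world": the BOM only
# pins the source encoding to utf-8 and never shows up in the token stream.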
tok_readline_raw
static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        // Grow the buffer if necessary
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        // Read one line of data
        char *line = Py_UniversalNewlineFgets(tok->inp,
                                              (int)(tok->end - tok->inp),
                                              tok->fp, NULL);
        if (line == NULL) {
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp = strchr(tok->inp, '\0');
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');
    return 1;
}
The Py_UniversalNewlineFgets function reads one line of data and translates \r and \r\n line endings into \n. The last line of a file may lack a trailing \n; if so, one is appended:
static int
tok_underflow_file(struct tok_state *tok) {
    ...
    if (tok->inp[-1] != '\n') {
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }
    ...
}
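Both behaviours can be observed from Python itself; a small illustration (a hedged sketch, not CPython internals):

# The last line has no trailing '\n', yet compilation succeeds because the
# tokenizer supplies the missing newline itself.
src = 'def hello():\n    print("hello world")'
compile(src, "<demo>", "exec")

# '\r\n' line endings are likewise accepted and normalized to '\n'.
compile(src.replace("\n", "\r\n"), "<demo-crlf>", "exec")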
The encoding declaration is searched for in the first two lines of the file. At this point decoding_state is STATE_SEEK_CODING and lineno has just been incremented to 1, so the code enters the actual encoding detection.
static int
tok_underflow_file(struct tok_state *tok) {
    ...
    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    ...
}
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  */
static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;
    } else {                /* then, compare cs with BOM */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}
The helper get_coding_spec scans a single comment line for coding:xxx or coding=xxx, and decoding_state then moves to STATE_NORMAL. If an encoding xxx is found and it is not utf-8, the set_readline callback is invoked to install a decoder for that encoding.
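For reference, an encoding declaration that get_coding_spec matches looks like this (PEP 263 syntax; gbk is only an example encoding):

# -*- coding: gbk -*-
# Placed on line 1 or 2, this comment makes check_coding_spec record "gbk"
# and, since it differs from "utf-8", call set_readline (fp_setreadl here)
# so the rest of the file is decoded with that codec.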
Since neither of the first two lines of hello.py declares an encoding, the default UTF-8 is used, and the file does not start with a BOM either, so tok->encoding stays NULL.
static int
tok_underflow_file(struct tok_state *tok) {
    ...
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    ...
}
static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet.  */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return 0;
    }
    return 1;
}
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.  */
static int
valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}
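The same leading-byte classification, transcribed to Python for readability (a rough sketch, not CPython code; it assumes the buffer actually contains the whole sequence):

def valid_utf8_py(buf: bytes, i: int = 0) -> int:
    """Return the length of the UTF-8 sequence starting at buf[i], or 0."""
    b = buf[i]
    if b < 0x80:                 # single-byte (ASCII) code
        return 1
    if b < 0xC0:                 # a continuation byte cannot start a sequence
        return 0
    if b < 0xE0:
        expected = 1             # 2-byte sequence
    elif b < 0xF0:
        expected = 2             # 3-byte sequence
    elif b < 0xF8:
        expected = 3             # 4-byte sequence
    else:
        return 0
    for k in range(1, expected + 1):
        if not (0x80 <= buf[i + k] < 0xC0):   # trailing bytes must be 10xxxxxx
            return 0
    return expected + 1

# e.g. valid_utf8_py(b"d") == 1, valid_utf8_py("中".encode()) == 3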
After a line has been read into the buffer buf, the pointers are laid out as sketched below.
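A rough sketch of the buffer state once the first line of hello.py has been read (positions are illustrative):

[ d  e  f     h  e  l  l  o  (  )  :  \n  \0 | ........ unused ........ ]
  ^                                        ^                            ^
  cur (== buf)                             inp (at the '\0')            end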
Now tok->cur != tok->inp holds, so the fast path in tok_nextc returns the current character directly.
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        ...
    }
    ...
}
/* Argument must be a char or an int in [-128, 127] or [0, 255]. */
#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
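The mask matters because a plain char may be signed: a byte such as 0xEF read through a signed char comes back as a negative value. A tiny illustration of the same arithmetic in Python (not CPython code):

# 0xEF stored in a signed 8-bit char reads back as -17;
# masking with 0xff recovers the original byte value.
print((-17) & 0xff)   # 239 == 0xEF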
For example, after the characters 'd', 'e' and 'f' have been returned, tok->cur has advanced three bytes past tok->buf, while tok->inp still points at the end of the data read so far.