本文基于 Python 3.10.2 的源码，
主要分析 Parser/tokenizer.c 中的 tok_get 函数。

一、预处理行首

1、跳过空白字符

1.1.1 空格、Tab、\014

static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{.../* Get indentation level */if (tok->atbol) { //atbol => at begin of line, 在行开头int col = 0;int altcol = 0;tok->atbol = 0;for (;;) {//死循环，一直跳过空格、\t、\014c = tok_nextc(tok);if (c == ' ') {col++, altcol++;}else if (c == '\t') {col = (col / tok->tabsize + 1) * tok->tabsize;altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;}else if (c == '\014')  {/* Control-L (formfeed) */col = altcol = 0; /* For Emacs users */}else {//其他字符，退出forbreak;}}tok_backup(tok, c); //将正常的字符放回缓冲区中...

1.1.2 处理空白行

对于行首空白之后紧跟注释符（#）、换行符或续行符（\）的行，判断是否需要将整行忽略（即不影响缩进，也不向解析器产生 NEWLINE）。

static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{.../* Get indentation level */if (tok->atbol) { //atbol => at begin of line, 在行开头...if (c == '#' || c == '\n' || c == '\\') {/* Lines with only whitespace and/or commentsand/or a line continuation charactershouldn't affect the indentation and arenot passed to the parser as NEWLINE tokens,except *totally* empty lines in interactivemode, which signal the end of a command group. */if (col == 0 && c == '\n' && tok->prompt != NULL) {blankline = 0; /* Let it through */}else if (tok->prompt != NULL && tok->lineno == 1) {/* In interactive mode, if the first line containsonly spaces and/or a comment, let it through. */blankline = 0;col = altcol = 0;}else {blankline = 1; /* Ignore completely */}/* We can't jump back right here since we stillmay need to skip to the end of a comment */}...

1.1.3 填写标识符列位置（根据行首缩进列数更新缩进栈，登记待产生的 INDENT/DEDENT）

static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{.../* Get indentation level */if (tok->atbol) {...//对非空白行以及level=0时，填写标识符的列信息if (!blankline && tok->level == 0) {if (col == tok->indstack[tok->indent]) {/* No change */if (altcol != tok->altindstack[tok->indent]) {return indenterror(tok);}}else if (col > tok->indstack[tok->indent]) {/* Indent -- always one */if (tok->indent+1 >= MAXINDENT) {tok->done = E_TOODEEP;tok->cur = tok->inp;return ERRORTOKEN;}if (altcol <= tok->altindstack[tok->indent]) {return indenterror(tok);}tok->pendin++;tok->indstack[++tok->indent] = col;tok->altindstack[tok->indent] = altcol;}else /* col < tok->indstack[tok->indent] */ {/* Dedent -- any number, must be consistent */while (tok->indent > 0 &&col < tok->indstack[tok->indent]) {tok->pendin--;tok->indent--;}if (col != tok->indstack[tok->indent]) {tok->done = E_DEDENT;tok->cur = tok->inp;return ERRORTOKEN;}if (altcol != tok->altindstack[tok->indent]) {return indenterror(tok);}}}

2、处理pendin

static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{...tok->start = tok->cur;/* Return pending indents/dedents */if (tok->pendin != 0) {if (tok->pendin < 0) {tok->pendin++;return DEDENT;}else {tok->pendin--;return INDENT;}}
...

3、预读一个字符

/* Peek ahead at the next character */c = tok_nextc(tok);tok_backup(tok, c);

4、处理async

 /* Check if we are closing an async function */if (tok->async_def&& !blankline/* Due to some implementation artifacts of type comments,* a TYPE_COMMENT at the start of a function won't set an* indentation level and it will produce a NEWLINE after it.* To avoid spuriously ending an async function due to this,* wait until we have some non-newline char in front of us. */&& c != '\n'&& tok->level == 0/* There was a NEWLINE after ASYNC DEF,so we're past the signature. */&& tok->async_def_nl/* Current indentation level is less than wherethe async function was defined */&& tok->async_def_indent >= tok->indent){tok->async_def = 0;tok->async_def_indent = 0;tok->async_def_nl = 0;}

二、识别行中的各个token

1、跳过空白字符

 again:tok->start = NULL;/* Skip spaces */do {c = tok_nextc(tok);} while (c == ' ' || c == '\t' || c == '\014');/* Set start of current token */tok->start = tok->cur - 1;

2、跳过注释

/* Skip comment, unless it's a type comment */if (c == '#') {const char *prefix, *p, *type_start;while (c != EOF && c != '\n') {c = tok_nextc(tok);}if (tok->type_comments) {p = tok->start;prefix = type_comment_prefix;while (*prefix && p < tok->cur) {if (*prefix == ' ') {while (*p == ' ' || *p == '\t') {p++;}} else if (*prefix == *p) {p++;} else {break;}prefix++;}/* This is a type comment if we matched all of type_comment_prefix. */if (!*prefix) {int is_type_ignore = 1;const char *ignore_end = p + 6;tok_backup(tok, c);  /* don't eat the newline or EOF */type_start = p;/* A TYPE_IGNORE is "type: ignore" followed by the end of the token* or anything ASCII and non-alphanumeric. */is_type_ignore = (tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0&& !(tok->cur > ignore_end&& ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));if (is_type_ignore) {*p_start = ignore_end;*p_end = tok->cur;/* If this type ignore is the only thing on the line, consume the newline also. */if (blankline) {tok_nextc(tok);tok->atbol = 1;}return TYPE_IGNORE;} else {*p_start = type_start;  /* after type_comment_prefix */*p_end = tok->cur;return TYPE_COMMENT;}}}}

3、结束检查

 /* Check for EOF and errors now */if (c == EOF) {if (tok->level) {return ERRORTOKEN;}return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;}

4、识别标识符

标识符以大小写字母、下划线或字节值大于等于128的字符（即非ASCII字符）开头，后面可以跟大小写字母、下划线、数字0-9以及字节值大于等于128的字符。

/*
 * Character-class helpers used by the identifier scanner.
 *
 * is_potential_identifier_start(c): true when c may begin an identifier,
 * i.e. an ASCII letter, '_' or any value >= 128 (a non-ASCII byte).
 * is_potential_identifier_char(c): same set plus the ASCII digits 0-9.
 *
 * The line continuations are restored (one backslash at the end of each
 * line) and the argument is parenthesized so that an expression argument
 * such as `x ? a : b` expands correctly.
 *
 * NOTE: the argument is expanded several times, so it must be free of
 * side effects.  The `>= 128` test can only succeed when c has a type
 * wider than signed char (presumably an int holding the byte value, as
 * returned by tok_nextc -- confirm at the call site).
 */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))

 /* Identifier (most frequent token!) */nonascii = 0;if (is_potential_identifier_start(c)) {/* Process the various legal combinations of b"", r"", u"", and f"". */int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;while (1) {if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))saw_b = 1;/* Since this is a backwards compatibility support literal we don'twant to support it in arbitrary order like byte literals. */else if (!(saw_b || saw_u || saw_r || saw_f)&& (c == 'u'|| c == 'U')) {saw_u = 1;}/* ur"" and ru"" are not supported */else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {saw_r = 1;}else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {saw_f = 1;}else {break;}c = tok_nextc(tok);if (c == '"' || c == '\'') {goto letter_quote;}}while (is_potential_identifier_char(c)) {if (c >= 128) {nonascii = 1;}c = tok_nextc(tok);}tok_backup(tok, c);if (nonascii && !verify_identifier(tok)) {return ERRORTOKEN;}*p_start = tok->start;*p_end = tok->cur;/* async/await parsing block. */if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {/* May be an 'async' or 'await' token.  For Python 3.7 orlater we recognize them unconditionally.  For Python3.5 or 3.6 we recognize 'async' in front of 'def', andeither one inside of 'async def'.  (Technically weshouldn't recognize these at all for 3.4 or earlier,but there's no *valid* Python 3.4 code that would berejected, and async functions will be rejected in alater phase.) */if (!tok->async_hacks || tok->async_def) {/* Always recognize the keywords. */if (memcmp(tok->start, "async", 5) == 0) {return ASYNC;}if (memcmp(tok->start, "await", 5) == 0) {return AWAIT;}}else if (memcmp(tok->start, "async", 5) == 0) {/* The current token is 'async'.Look ahead one token to see if that is 'def'. 
*/struct tok_state ahead_tok;const char *ahead_tok_start = NULL;const char *ahead_tok_end = NULL;int ahead_tok_kind;memcpy(&ahead_tok, tok, sizeof(ahead_tok));ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,&ahead_tok_end);if (ahead_tok_kind == NAME&& ahead_tok.cur - ahead_tok.start == 3&& memcmp(ahead_tok.start, "def", 3) == 0){/* The next token is going to be 'def', so instead ofreturning a plain NAME token, return ASYNC. */tok->async_def_indent = tok->indent;tok->async_def = 1;return ASYNC;}}}return NAME;}

5、识别换行

/* Newline */if (c == '\n') {tok->atbol = 1;if (blankline || tok->level > 0) {goto nextline;}*p_start = tok->start;*p_end = tok->cur - 1; /* Leave '\n' out of the string */tok->cont_line = 0;if (tok->async_def) {/* We're somewhere inside an 'async def' function, andwe've encountered a NEWLINE after its signature. */tok->async_def_nl = 1;}return NEWLINE;}

6、识别DOT

/* Period or number starting with period? */if (c == '.') {c = tok_nextc(tok);if (isdigit(c)) {goto fraction;} else if (c == '.') {c = tok_nextc(tok);if (c == '.') {*p_start = tok->start;*p_end = tok->cur;return ELLIPSIS;}else {tok_backup(tok, c);}tok_backup(tok, '.');}else {tok_backup(tok, c);}*p_start = tok->start;*p_end = tok->cur;return DOT;}

7、识别数值

/* Number */if (isdigit(c)) {if (c == '0') {/* Hex, octal or binary -- maybe. */c = tok_nextc(tok);if (c == 'x' || c == 'X') {/* Hex */c = tok_nextc(tok);do {if (c == '_') {c = tok_nextc(tok);}if (!isxdigit(c)) {tok_backup(tok, c);return syntaxerror(tok, "invalid hexadecimal literal");}do {c = tok_nextc(tok);} while (isxdigit(c));} while (c == '_');if (!verify_end_of_number(tok, c, "hexadecimal")) {return ERRORTOKEN;}}else if (c == 'o' || c == 'O') {/* Octal */c = tok_nextc(tok);do {if (c == '_') {c = tok_nextc(tok);}if (c < '0' || c >= '8') {if (isdigit(c)) {return syntaxerror(tok,"invalid digit '%c' in octal literal", c);}else {tok_backup(tok, c);return syntaxerror(tok, "invalid octal literal");}}do {c = tok_nextc(tok);} while ('0' <= c && c < '8');} while (c == '_');if (isdigit(c)) {return syntaxerror(tok,"invalid digit '%c' in octal literal", c);}if (!verify_end_of_number(tok, c, "octal")) {return ERRORTOKEN;}}else if (c == 'b' || c == 'B') {/* Binary */c = tok_nextc(tok);do {if (c == '_') {c = tok_nextc(tok);}if (c != '0' && c != '1') {if (isdigit(c)) {return syntaxerror(tok,"invalid digit '%c' in binary literal", c);}else {tok_backup(tok, c);return syntaxerror(tok, "invalid binary literal");}}do {c = tok_nextc(tok);} while (c == '0' || c == '1');} while (c == '_');if (isdigit(c)) {return syntaxerror(tok,"invalid digit '%c' in binary literal", c);}if (!verify_end_of_number(tok, c, "binary")) {return ERRORTOKEN;}}else {int nonzero = 0;/* maybe old-style octal; c is first char of it *//* in any case, allow '0' as a literal */while (1) {if (c == '_') {c = tok_nextc(tok);if (!isdigit(c)) {tok_backup(tok, c);return syntaxerror(tok, "invalid decimal literal");}}if (c != '0') {break;}c = tok_nextc(tok);}char* zeros_end = tok->cur;if (isdigit(c)) {nonzero = 1;c = tok_decimal_tail(tok);if (c == 0) {return ERRORTOKEN;}}if (c == '.') {c = tok_nextc(tok);goto fraction;}else if (c == 'e' || c == 'E') {goto exponent;}else if (c == 'j' || c == 'J') {goto imaginary;}else if 
(nonzero) {/* Old-style octal: now disallowed. */tok_backup(tok, c);return syntaxerror_known_range(tok, (int)(tok->start + 1 - tok->line_start),(int)(zeros_end - tok->line_start),"leading zeros in decimal integer ""literals are not permitted; ""use an 0o prefix for octal integers");}if (!verify_end_of_number(tok, c, "decimal")) {return ERRORTOKEN;}}}else {/* Decimal */c = tok_decimal_tail(tok);if (c == 0) {return ERRORTOKEN;}{/* Accept floating point numbers. */if (c == '.') {c = tok_nextc(tok);fraction:/* Fraction */if (isdigit(c)) {c = tok_decimal_tail(tok);if (c == 0) {return ERRORTOKEN;}}}if (c == 'e' || c == 'E') {int e;exponent:e = c;/* Exponent part */c = tok_nextc(tok);if (c == '+' || c == '-') {c = tok_nextc(tok);if (!isdigit(c)) {tok_backup(tok, c);return syntaxerror(tok, "invalid decimal literal");}} else if (!isdigit(c)) {tok_backup(tok, c);if (!verify_end_of_number(tok, e, "decimal")) {return ERRORTOKEN;}tok_backup(tok, e);*p_start = tok->start;*p_end = tok->cur;return NUMBER;}c = tok_decimal_tail(tok);if (c == 0) {return ERRORTOKEN;}}if (c == 'j' || c == 'J') {/* Imaginary part */imaginary:c = tok_nextc(tok);if (!verify_end_of_number(tok, c, "imaginary")) {return ERRORTOKEN;}}else if (!verify_end_of_number(tok, c, "decimal")) {return ERRORTOKEN;}}}tok_backup(tok, c);*p_start = tok->start;*p_end = tok->cur;return NUMBER;}

8、识别string

/* String */if (c == '\'' || c == '"') {int quote = c;int quote_size = 1;             /* 1 or 3 */int end_quote_size = 0;/* Nodes of type STRING, especially multi line stringsmust be handled differently in order to get boththe starting line number and the column offset right.(cf. issue 16806) */tok->first_lineno = tok->lineno;tok->multi_line_start = tok->line_start;/* Find the quote size and start of string */c = tok_nextc(tok);if (c == quote) {c = tok_nextc(tok);if (c == quote) {quote_size = 3;}else {end_quote_size = 1;     /* empty string found */}}if (c != quote) {tok_backup(tok, c);}/* Get rest of string */while (end_quote_size != quote_size) {c = tok_nextc(tok);if (c == EOF || (quote_size == 1 && c == '\n')) {assert(tok->multi_line_start != NULL);// shift the tok_state's location into// the start of string, and report the error// from the initial quote charactertok->cur = (char *)tok->start;tok->cur++;tok->line_start = tok->multi_line_start;int start = tok->lineno;tok->lineno = tok->first_lineno;if (quote_size == 3) {return syntaxerror(tok,"unterminated triple-quoted string literal"" (detected at line %d)", start);}else {return syntaxerror(tok,"unterminated string literal (detected at"" line %d)", start);}}if (c == quote) {end_quote_size += 1;}else {end_quote_size = 0;if (c == '\\') {tok_nextc(tok);  /* skip escaped char */}}}*p_start = tok->start;*p_end = tok->cur;return STRING;}

9、识别多字符token

识别如 ==、-=、+=、-> 等双字符token，以及 **=、//=、... 等三字符token。

    /* Check for two-character token */{int c2 = tok_nextc(tok);int token = PyToken_TwoChars(c, c2);if (token != OP) {int c3 = tok_nextc(tok);int token3 = PyToken_ThreeChars(c, c2, c3);if (token3 != OP) {token = token3;}else {tok_backup(tok, c3);}*p_start = tok->start;*p_end = tok->cur;return token;}tok_backup(tok, c2);}

10、处理括号嵌套

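通过栈操作确定括号是否配对：遇到左括号时，将括号字符及其行号、列号分别压入 parenstack、parenlinenostack、parencolstack，并将 tok->level 加一；遇到右括号时将 tok->level 减一，并取出对应的左括号检查二者是否匹配。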

 /* Keep track of parentheses nesting level */switch (c) {case '(':case '[':case '{':if (tok->level >= MAXLEVEL) {return syntaxerror(tok, "too many nested parentheses");}tok->parenstack[tok->level] = c;tok->parenlinenostack[tok->level] = tok->lineno;tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);tok->level++;break;case ')':case ']':case '}':if (!tok->level) {return syntaxerror(tok, "unmatched '%c'", c);}tok->level--;int opening = tok->parenstack[tok->level];if (!((opening == '(' && c == ')') ||(opening == '[' && c == ']') ||(opening == '{' && c == '}'))){if (tok->parenlinenostack[tok->level] != tok->lineno) {return syntaxerror(tok,"closing parenthesis '%c' does not match ""opening parenthesis '%c' on line %d",c, opening, tok->parenlinenostack[tok->level]);}else {return syntaxerror(tok,"closing parenthesis '%c' does not match ""opening parenthesis '%c'",c, opening);}}break;}/* Punctuation character */*p_start = tok->start;*p_end = tok->cur;return PyToken_OneChar(c);

python3词法分析（三）识别token相关推荐

atitit.词法分析的实现token attilax总结
atitit.词法分析的实现token attilax总结 1. 词法分析(英语:lexical analysis)跟token 1 1.1. 扫描器 2 2. 单词流必须识别为保留字,标识符(变量) ...
Python3 图片文字识别翻译——调用百度AI、百度翻译和有道翻译的API
文章目录 Python3 图片文字识别翻译--调用百度AI.百度翻译和有道翻译的API 一.演示二. API准备三. 图片文字识别--调用百度AI文字识别API 四. 文字翻译 1. 百度翻译请 ...
python3中三个点是啥意思?
python3中三个点是啥意思? 问题引入前段时间看 fastapi 代码的时候, 看类型注解的部分经常出现 ... 这样的东西,我就不理解这是什么意思呢? 比如像下面这个参数的类型注解 ...
人工智能python3+tensorflow人脸识别_机器学习tensorflow object detection 实现人脸识别...
object detection是Tensorflow很常用的api,功能强大,很有想象空间,人脸识别,花草识别,物品识别等.下面是我做实验的全过程,使用自己收集的胡歌图片,实现人脸识别,找出胡歌. ...
Python3 生成和识别二维码
文章目录问题描述代码一点解释 [写在前面] 在当今社会中,二维码的使用非常的广泛.当然作为一名充满好奇心又爱装逼的 coder 怎么能不会使用二维码编码自己想表达的内容呢,尤其是用在装逼和找对象 ...
在CentOS上安装Python3的三种方法
Centos7默认自带了Python2.7版本,但是因为项目需要使用Python3.x你可以按照此文的三个方法进行安装. 注:本文示例安装版本为Python3.5, 一.Python源代码编译安装安 ...
平板安装python_在CentOS上安装Python3的三种方法
Centos7默认自带了Python2.7版本,但是因为项目需要使用Python3.x你可以按照此文的三个方法进行安装. 注:本文示例安装版本为Python3.5, 一.Python源代码编译安装安 ...
linux安装python3.7的步骤_centos7安装python3 的三种方式
Centos7默认自带了Python2.7版本,但是因为项目需要使用Python3.x你可以按照此文的三个方法进行安装. 安装必要工具 yum-utils ,它的功能是管理repository及扩展包 ...
python3 ocr_python3 ocr 识别图片文字（CSDN验证码90%通过）
[实例简介] 上次下载了个pytesser_v0.0.1,它是 python2的,我给改成python3的.并且做成子目录下的模块吧. 这样不显得目录乱. 放在开发目录下就能用了 ocr. impor ...

python3词法分析（三）识别token

一、预处理行首

1、跳过空白字符

1.1.1 空格、Tab、\014

1.1.2 处理空白行

1.1.3 填写标识符列位置

2、处理pendin

3、预读一个字符

4、处理async

二、识别行中的各个token

1、跳过空白字符

2、跳过注释

3、结束检查

4、识别标识符

5、识别换行

6、识别DOT

7、识别数值

8、识别string

9、识别多字符token

10、处理括号嵌套

python3词法分析（三）识别token相关推荐

最新文章

热门文章