自己动手写C语言编译器（3）

词法分析器部分完成。

支持：

1.支持单词分割

2.支持数字类型

3.支持字符串

4.支持换行

5.支持注释

不支持：

1.不支持关键字

2.不支持变量。

3.不支持关键字。

4.不支持操作符。

偶没有被那些个编译原理课程所吓倒。。。。。真的勇士，只管前行！

#ifndef _ISTREAMTOKENIZER_H_
#define _ISTREAMTOKENIZER_H_
#include <limits.h>
#include <cctype>
#include <istream>
#include <string>
#include <vector>
#define _COUNT_OF(a) (sizeof(a)/sizeof(a[0]))
/**
 * Splits a character stream into tokens: words, numbers, quoted strings,
 * optional end-of-line markers and single "ordinary" characters.
 *
 * The design follows java.io.StreamTokenizer: every byte value 0..255 has an
 * entry in a syntax table (ctype) made of CT_* flag bits, and nextToken()
 * dispatches on the flags of the first non-whitespace character it reads.
 *
 * After a call to nextToken():
 *   - ttype holds the token type (one of the TT_* constants, the quote
 *     character for a quoted string, or the character itself for an
 *     ordinary character);
 *   - sval holds the text of a TT_WORD token or the body of a quoted string;
 *   - nval holds the value of a TT_NUMBER token.
 *
 * All class constants carry in-class initializers, so this header does not
 * need (and must not be given, in more than one translation unit)
 * out-of-class definitions. In pre-C++17 code they should only be used by
 * value, not bound to references.
 */
class IstreamTokenizer
{
private:
    /*
     * Sentinel values stored in peekc instead of a real character.
     * NEED_CHAR: a new character must be read.
     * SKIP_LF:   a new character must be read and, if it is '\n', discarded
     *            and replaced by the character after it.
     */
    static const int NEED_CHAR = INT_MAX;
    static const int SKIP_LF = INT_MAX - 1;

    /* Number of entries in the syntax table (one per byte value). */
    static const int CTYPE_SIZE = 256;

    /* Character-class flag bits stored in the syntax table. */
    static const unsigned char CT_WHITESPACE = 1;
    static const unsigned char CT_DIGIT = 2;
    static const unsigned char CT_ALPHA = 4;
    static const unsigned char CT_QUOTE = 8;
    static const unsigned char CT_COMMENT = 16;

public:
    /*
     * Token types reported through ttype / nextToken(). The values mirror
     * java.io.StreamTokenizer; the negative ones cannot collide with a
     * character code returned for ordinary characters.
     */
    static const int TT_EOF = -1;      /* end of the stream               */
    static const int TT_EOL = '\n';    /* end of a line (if significant)  */
    static const int TT_NUMBER = -2;   /* a number; value is in nval      */
    static const int TT_WORD = -3;     /* a word; text is in sval         */
    static const int TT_NOTHING = -4;  /* no token has been read yet      */

private:
    std::istream& input;           /* stream the characters are read from   */
    std::vector<char> buf;         /* scratch buffer for the current token  */
    int peekc;                     /* look-ahead char, NEED_CHAR or SKIP_LF */
    bool pushedBack;               /* next nextToken() repeats ttype        */
    bool forceLower;               /* lower-case TT_WORD text               */
    int LINENO;                    /* current line number, starting at 1    */
    bool eolIsSignificantP;        /* report line ends as TT_EOL            */
    bool slashSlashCommentsP;      /* recognise // comments                 */
    bool slashStarCommentsP;       /* recognise slash-star comments         */
    unsigned char ctype[CTYPE_SIZE]; /* syntax table: CT_* flags per byte   */

public:
    std::string sval;              /* text of the last word / quoted string */
    double nval;                   /* value of the last TT_NUMBER token     */
    int ttype;                     /* type of the last token                */

private:
    /** Installs the default syntax table (letters, whitespace, '/', quotes, numbers). */
    void init()
    {
        /* Start from an all-ordinary table: the setters below OR into it. */
        resetSyntax();
        wordChars('a', 'z');
        wordChars('A', 'Z');
        wordChars(128 + 32, 255);
        whitespaceChars(0, ' ');
        commentChar('/');
        quoteChar('"');
        quoteChar('\'');
        parseNumbers();
    }

    /**
     * Returns the CT_* flags for character code c.
     * A negative code (end of stream) counts as whitespace so that it ends
     * a word; codes beyond the table are treated as word characters.
     */
    int charType(int c) const
    {
        if (c < 0)
            return CT_WHITESPACE;
        if (c < CTYPE_SIZE)
            return ctype[c];
        return CT_ALPHA;
    }

    /** Reads the next character; a negative result means end of stream. */
    int read()
    {
        return input.get();
    }

    /**
     * Discards characters up to (not including) the next '\r' or '\n' or
     * the end of the stream, then returns the token that follows.
     */
    int skipLineAndContinue()
    {
        int c;
        do {
            c = read();
        } while (c != '\n' && c != '\r' && c >= 0);
        peekc = c;
        return nextToken();
    }

public:
    /**
     * Creates a tokenizer reading from is, with the default syntax table,
     * line number 1, all option flags off and ttype == TT_NOTHING.
     * The stream is held by reference and must outlive the tokenizer.
     */
    IstreamTokenizer(std::istream& is)
        : input(is), buf(), peekc(NEED_CHAR), pushedBack(false),
          forceLower(false), LINENO(1), eolIsSignificantP(false),
          slashSlashCommentsP(false), slashStarCommentsP(false),
          sval(), nval(0.0), ttype(TT_NOTHING)
    {
        buf.reserve(20);
        init();
    }

    /** Makes every character ordinary (clears the whole syntax table). */
    void resetSyntax()
    {
        for (int i = 0; i < CTYPE_SIZE; ++i)
            ctype[i] = 0;
    }

    /**
     * Adds the word-character flag to every code in [low, hi].
     * The range is clamped to the table; an empty range (hi < low after
     * clamping, including a negative hi) changes nothing.
     */
    void wordChars(int low, int hi)
    {
        if (low < 0)
            low = 0;
        if (hi >= CTYPE_SIZE)
            hi = CTYPE_SIZE - 1;
        while (low <= hi)
            ctype[low++] |= CT_ALPHA;
    }

    /** Marks every code in [low, hi] as whitespace, replacing other flags. */
    void whitespaceChars(int low, int hi)
    {
        if (low < 0)
            low = 0;
        if (hi >= CTYPE_SIZE)
            hi = CTYPE_SIZE - 1;
        while (low <= hi)
            ctype[low++] = CT_WHITESPACE;
    }

    /** Marks every code in [low, hi] as ordinary (no flags). */
    void ordinaryChars(int low, int hi)
    {
        if (low < 0)
            low = 0;
        if (hi >= CTYPE_SIZE)
            hi = CTYPE_SIZE - 1;
        while (low <= hi)
            ctype[low++] = 0;
    }

    /** Marks ch as ordinary; codes outside the table are ignored. */
    void ordinaryChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = 0;
    }

    /** Makes ch start a comment that runs to the end of the line. */
    void commentChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = CT_COMMENT;
    }

    /** Makes ch a string delimiter; the matching ch closes the string. */
    void quoteChar(int ch)
    {
        if (ch >= 0 && ch < CTYPE_SIZE)
            ctype[ch] = CT_QUOTE;
    }

    /** Adds the digit flag to '0'..'9', '.' and '-' so numbers are parsed. */
    void parseNumbers()
    {
        for (int i = '0'; i <= '9'; i++)
            ctype[i] |= CT_DIGIT;
        ctype['.'] |= CT_DIGIT;
        ctype['-'] |= CT_DIGIT;
    }

    /**
     * Chooses whether line ends are tokens.
     *
     * When flag is true, nextToken() returns TT_EOL at each line end, where
     * a line end is '\r', '\n' or the pair "\r\n" (reported once).
     * When flag is false, line-end characters are skipped as whitespace.
     * The line counter is advanced in both modes.
     */
    void eolIsSignificant(bool flag)
    {
        eolIsSignificantP = flag;
    }

    /** Enables or disables recognition of slash-star ... star-slash comments. */
    void slashStarComments(bool flag)
    {
        slashStarCommentsP = flag;
    }

    /** Enables or disables recognition of // line comments. */
    void slashSlashComments(bool flag)
    {
        slashSlashCommentsP = flag;
    }

    /** When fl is true, the text of TT_WORD tokens is converted to lower case. */
    void lowerCaseMode(bool fl)
    {
        forceLower = fl;
    }

    /**
     * Reads the next token from the stream.
     *
     * @return the token type, also stored in ttype: TT_EOF, TT_EOL,
     *         TT_NUMBER (value in nval), TT_WORD (text in sval), the quote
     *         character for a quoted string (body in sval), or the
     *         character code of an ordinary character.
     */
    int nextToken()
    {
        if (pushedBack) {
            pushedBack = false;
            return ttype;
        }

        int c = peekc;
        if (c < 0)
            c = NEED_CHAR;
        if (c == SKIP_LF) {
            /* Previous token was an EOL caused by '\r': swallow a paired '\n'. */
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
            if (c == '\n')
                c = NEED_CHAR;
        }
        if (c == NEED_CHAR) {
            c = read();
            if (c < 0)
                return ttype = TT_EOF;
        }
        ttype = c; /* Just to be safe */

        /* Unless a branch below stores a look-ahead character, the next
         * call starts by reading a fresh one. */
        peekc = NEED_CHAR;

        int kind = charType(c);

        /* ---- whitespace and line ends ---- */
        while ((kind & CT_WHITESPACE) != 0) {
            if (c == '\r') {
                LINENO++;
                if (eolIsSignificantP) {
                    /* Report the line end now; drop a following '\n' later. */
                    peekc = SKIP_LF;
                    return ttype = TT_EOL;
                }
                c = read();
                if (c == '\n')
                    c = read();
            } else {
                if (c == '\n') {
                    LINENO++;
                    if (eolIsSignificantP)
                        return ttype = TT_EOL;
                }
                c = read();
            }
            if (c < 0)
                return ttype = TT_EOF;
            kind = charType(c);
        }

        /* ---- numbers ---- */
        if ((kind & CT_DIGIT) != 0) {
            bool neg = false;
            if (c == '-') {
                c = read();
                if (c != '.' && (c < '0' || c > '9')) {
                    /* A lone '-' is an ordinary character, not a sign. */
                    peekc = c;
                    return ttype = '-';
                }
                neg = true;
            }
            double v = 0;
            int decexp = 0;  /* number of digits seen after the '.' */
            int seendot = 0;
            while (true) {
                if (c == '.' && seendot == 0)
                    seendot = 1;
                else if ('0' <= c && c <= '9') {
                    v = v * 10 + (c - '0');
                    decexp += seendot;
                } else
                    break;
                c = read();
            }
            peekc = c;
            if (decexp != 0) {
                double denom = 10;
                decexp--;
                while (decexp > 0) {
                    denom *= 10;
                    decexp--;
                }
                /* Do one division of a likely-to-be-more-accurate number */
                v = v / denom;
            }
            nval = neg ? -v : v;
            return ttype = TT_NUMBER;
        }

        /* ---- words ---- */
        if ((kind & CT_ALPHA) != 0) {
            buf.clear();
            do {
                buf.push_back(static_cast<char>(c));
                c = read();
                kind = charType(c);
            } while ((kind & (CT_ALPHA | CT_DIGIT)) != 0);
            peekc = c;
            sval.assign(buf.begin(), buf.end());
            if (forceLower) {
                for (std::string::size_type k = 0; k < sval.size(); ++k)
                    sval[k] = static_cast<char>(
                        std::tolower(static_cast<unsigned char>(sval[k])));
            }
            return ttype = TT_WORD;
        }

        /* ---- quoted strings ---- */
        if ((kind & CT_QUOTE) != 0) {
            ttype = c;
            buf.clear();
            /* d is always the look-ahead character; c is the (possibly
             * escape-decoded) character to store. The string ends at the
             * matching quote, a line end or the end of the stream. */
            int d = read();
            while (d >= 0 && d != ttype && d != '\n' && d != '\r') {
                if (d == '\\') {
                    c = read();
                    int first = c; /* To allow \377, but not \477 */
                    if (c >= '0' && c <= '7') {
                        /* Octal escape of one to three digits. */
                        c = c - '0';
                        int c2 = read();
                        if ('0' <= c2 && c2 <= '7') {
                            c = (c << 3) + (c2 - '0');
                            c2 = read();
                            if ('0' <= c2 && c2 <= '7' && first <= '3') {
                                c = (c << 3) + (c2 - '0');
                                d = read();
                            } else
                                d = c2;
                        } else
                            d = c2;
                    } else {
                        /* Named escape; any other character stands for itself. */
                        switch (c) {
                        case 'a':
                            c = 0x7;
                            break;
                        case 'b':
                            c = '\b';
                            break;
                        case 'f':
                            c = 0xC;
                            break;
                        case 'n':
                            c = '\n';
                            break;
                        case 'r':
                            c = '\r';
                            break;
                        case 't':
                            c = '\t';
                            break;
                        case 'v':
                            c = 0xB;
                            break;
                        }
                        d = read();
                    }
                } else {
                    c = d;
                    d = read();
                }
                buf.push_back(static_cast<char>(c));
            }
            /* If we broke out of the loop because we found a matching quote
             * character then arrange to read a new character next time
             * around; otherwise, save the character. */
            if (d == ttype)
                peekc = NEED_CHAR;
            else
                peekc = d;
            sval.assign(buf.begin(), buf.end());
            return ttype;
        }

        /* ---- C / C++ style comments ---- */
        if (c == '/' && (slashSlashCommentsP || slashStarCommentsP)) {
            c = read();
            if (c == '*' && slashStarCommentsP) {
                int prevc = 0;
                while ((c = read()) != '/' || prevc != '*') {
                    if (c == '\r') {
                        LINENO++;
                        c = read();
                        if (c == '\n') {
                            c = read();
                        }
                    } else {
                        if (c == '\n') {
                            LINENO++;
                            c = read();
                        }
                    }
                    if (c < 0)
                        return ttype = TT_EOF;
                    prevc = c;
                }
                return nextToken();
            } else if (c == '/' && slashSlashCommentsP) {
                return skipLineAndContinue();
            } else {
                /* Now see if it is still a single line comment */
                if ((ctype['/'] & CT_COMMENT) != 0) {
                    return skipLineAndContinue();
                } else {
                    peekc = c;
                    return ttype = '/';
                }
            }
        }

        /* ---- single-character line comments ---- */
        if ((kind & CT_COMMENT) != 0)
            return skipLineAndContinue();

        /* ---- ordinary character ---- */
        return ttype = c;
    }

    /**
     * Makes the next nextToken() call return the current ttype again
     * (sval / nval are left untouched). Has no effect before the first
     * token has been read.
     */
    void pushBack()
    {
        if (ttype != TT_NOTHING)
            pushedBack = true;
    }

    /** Returns the current line number (the first line is 1). */
    int lineno() const
    {
        return LINENO;
    }

    /* Declared only: no definition is visible in this header. */
    std::string toString();
};
#endif

自己动手写C语言编译器（3）相关推荐

自己动手写C语言编译器（5）
依托Bison强大的工具, 写编译器,先用编译原理搞个计算器玩玩.如下: Bison工具: http://coolshell.cn/articles/1547.html http://www.gnu. ...
自己动手写C语言编译器（4）
Statement由Expression构成,Expression由Token构成,Token由char构成. 从上到下呈现树形结构. 程序是由statement组成的,其实我们要的就是一种判断Sta ...
自己动手写C语言编译器（暂停）
1.开源的东西其实并不好用. 2.需要更多的关注Android开发和linux的工具的使用. 3.llvm编译出错的问题一直没有解决. 4.也许需要亲自与llvm的官方进行沟通.这是一个庞大的任务.
自己动手写C语言编译器（2）
直接上代码 : 支持:左右结合性 // MyCompiler.cpp : Defines the entry point for the console application. // #includ ...
自己动手写C语言编译器（1）
直接上代码 (表达式树在此初具模型 ) 由于没有做前期的"词法分析", 1.支持单个字符形式的变量 2.支持单个字符形式的操作符 3.支持优先级 . 不支持 1 ...
java c语言 for_Java能写C语言编译器吗
用java是可以写语言编译器的,用任何语言工具写都可以.各语言开发工具间只有开发效率的差异,没有可行不可行的区别. 编译器其实就是一个翻译工具,它可以把代码直接翻译成二进制文件交给CPU执行(二进制指 ...
自己动手制作C 语言编译器（8）：表达式
这是整个编译器的最后一部分,解析表达式.什么是表达式?表达式是将各种语言要素的一个组合,用来求值.例如:函数调用.变量赋值.运算符运算等等. 表达式的解析难点有二:一是运算符的优先级问题,二是如何将表 ...
自己动手制作C 语言编译器（7）：语句
整个编译器还剩下最后两个部分:语句和表达式的解析.它们的内容比较多,主要涉及如何将语句和表达式编译成汇编代码.这章讲解语句的解析,相对于表达式来说它还是较为容易的. 语句 C 语言区分"语句 ...
keil c语言编译运行,Keil的c语言编译器
我曾经通过查看反汇编代码对KEILC编译器进行了测试,大概有这么一下内容,也得出一些结论. (1)全局变量:如果程序中定义了全局变量,而且初始值不是0.此时,在程序调到main()函数执行前,除了要进 ...

自己动手写C语言编译器（3）

自己动手写C语言编译器（3）相关推荐

最新文章

热门文章