编译原理实践(一)

为了巩固在编译原理课程上的内容,我们会尝试写一个“编译器”前端,即包括:词法分析,语义分析,语法分析等。直白地说就是写一个自己的脚本语言,可以跑起来之类的。本笔记记录的是脚本语言:MengNan的创建过程,该语言运行在java上。

词法解析

词法解析的作用是告诉编译器代码中的某个单词是什么东西,比如:

int cat = 5;

这段代码在编译器看来应该是这样的:

int id = number;

其中id是变量的专有标记而number很明显用于统一表示数字,注意这里的数字不止整型,还可以:

int cat = 9.666;

带小数的也可以被认为是number

这里额外提一下,这个number的识别靠的是正则表达式:

[0-9]+(.[0-9]+)?

这个式子可以筛选数字

代码

Lexer.java
package Lexer;

import Exception.*;
import Token.*;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Lexer {
    public static String regexPattern = "\\s*((//.*)|(if|else|while)|([0-9]+(.[0-9]+)?)|(\"(\\\\\"|\\\\\\\\|\\\\n|[^\"])*\")|([A-Z_a-z][A-Z_a-z0-9]*)|(\\+\\+|--)|(==|[+\\-*/=><%])|([;(){}]))?";

    private Pattern pattern = Pattern.compile(regexPattern);
    private ArrayList<Token> queue = new ArrayList<>();
    private boolean hasMore;
    private LineNumberReader reader;
    private HashMap<String,Object> glossary;

    public Lexer(Reader reader){
        hasMore = true;
        this.reader = new LineNumberReader(reader);
        glossary = new HashMap<>();
    }
    public Token read() throws ParseException {
        if(fillQueue(0))
            return queue.remove(0);
        else
            return Token.EOF;
    }
    public Token peak() throws ParseException {
        if(fillQueue(1))
            return queue.get(1);
        else
            return Token.EOF;
    }
    private boolean fillQueue(int i) throws ParseException {
        while (i >= queue.size())
        {
            if(hasMore)
                readLine();
            else
                return false;
        }
        return true;            //一行的处理结果没有读完,读完再处理下一行
    }
    private void readLine() throws ParseException {
        String line;
        try {
            line = reader.readLine();
        }catch (IOException io){
            throw  new ParseException(io);
        }
        if(line == null){
            hasMore = false;
            return;
        }
        int lineNum = reader.getLineNumber();
        Matcher matcher = pattern.matcher(line);
        matcher.useTransparentBounds(true).useAnchoringBounds(false);
        int pos = 0;
        int endPos = line.length();
        while (pos < endPos){
            matcher.region(pos,endPos);
            //从目标字符串开始位置进行匹配。只有在有匹配且匹配的某一子串中包含目标字符串第一个字符的情况下才会返回true。
            if(matcher.lookingAt()){    
                addToken(lineNum,matcher);      //遇到正则表达式认识的单词,康康是哪个
                pos = matcher.end();            //滑动窗口往前走,把刚刚认识的单词踢出去
            }else
                throw new ParseException("wrong token at line" + lineNum);
        }
        queue.add(new SplitToken(lineNum, Token.EOL));
    }
    private void addToken(int num, Matcher matcher){
        String m = matcher.group(1);
        if(m != null){      //非空格
            if(matcher.group(2) == null){       //非注解,如果是注解直接跳过
                Token token = null;
                if(matcher.group(3) != null)    //这个token是关键字
                    token = new KeywordToken(num, m);
                else if(matcher.group(4) != null)       //是数字
                    token = new NumToken(num, Double.parseDouble(m));
                else if(matcher.group(6) != null)       //是字符串
                    token = new StrToken(num, beautifyString(m));
                else if(matcher.group(8) != null){      //是变量
                    token = new IdToken(num,m);
                    this.glossary.put(m, null);         //变量放到词汇表中
                }
                else if(matcher.group(9) != null)       //是操作符
                    token = new OperToken(num,m);
                else if(matcher.group(10) != null)      //是双目操作符
                    token = new DoubleOperToken(num,m);
                else if(matcher.group(11) != null)      //是分隔符,比如;{}()
                    token = new SplitToken(num,m);
                queue.add(token);       //将解析处来的token加入队列
            }
        }
    }

    private String beautifyString(String s){
        StringBuilder builder = new StringBuilder();
        int len = s.length() - 1;
        for(int i = 1; i < len; i++){       //从1开始是因为舍去第一个双引号
            char c = s.charAt(i);
            if(c == '\\' && i + 1 < len){
                char c2 = s.charAt(i + 1);
                if(c2 == '"' || c2 == '\\'){
                    c = s.charAt(++i);
                }else if(c2 == 'n'){
                    i++;
                    c = '\n';
                }
            }
            builder.append(c);
        }
        return builder.toString();
    }

    public void showGlossary(){
        System.out.println(this.glossary);
    }

    public HashMap<String, Object> getGlossary(){
        return glossary;
    }
}

Token.java
package Token;

public class Token {
    public static final Token EOF = new Token(-1){};
    public static final String EOL = "EOL";
    private int lineNumber;
    public Token(int line){
        lineNumber = line;
    }
    public String getType(){return "none";}
    public boolean isKeyword(){return false;}
    public boolean isOperator(){return false;}
    public boolean isId(){return false;}
    public boolean isNumber(){return false;}
    public boolean isString(){return false;}
    public String getText(){return "";}
    public void setTypeToText(){}
    public int getLineNumber() {
        return lineNumber;
    }
}
KeywordToken.java
package Token;

public class KeywordToken extends Token {
    private String key;
    private String type = "keyword";
    public KeywordToken(int line,String key) {
        super(line);
        this.key = key;
    }

    @Override
    public String getType() {
        return type;
    }

    @Override
    public boolean isKeyword() {
        return true;
    }

    @Override
    public String getText() {
        return key;
    }

}

其他的token类与上面展示的类似

bootstrap.java
public class bootstrap {
    public static void main(String[] args) throws ParseException, IOException {
        Lexer lexer = new Lexer(new FileReader("D:/code.txt"));
        for (Token t; (t = lexer.read()) != Token.EOF;)
            System.out.println(" === > " + t.getText() + "  " + t.getType());
    }
}
code.txt
ppp = 5 + 5;
if(ppp > 55){
    ppp = "hello world";
}else{
    ppp = "bye";
}
flag = 1
while(flag > 0){

}

运行结果

 === > ppp  id
 === > =  double_operator
 === > 5.0  number
 === > +  double_operator
 === > 5.0  number
 === > ;  split
 === > EOL  split
 === > if  keyword
 === > (  split
 === > ppp  id
 === > >  double_operator
 === > 55.0  number
 === > )  split
 === > {  split
 === > EOL  split
 === > ppp  id
 === > =  double_operator
 === > hello world  string
 === > ;  split
 === > EOL  split
 === > }  split
 === > else  keyword
 === > {  split
 === > EOL  split
 === > ppp  id
 === > =  double_operator
 === > bye  string
 === > ;  split
 === > EOL  split
 === > }  split
 === > EOL  split
 === > flag  id
 === > =  double_operator
 === > 1.0  number
 === > EOL  split
 === > while  keyword
 === > (  split
 === > flag  id
 === > >  double_operator
 === > 0.0  number
 === > )  split
 === > {  split
 === > EOL  split
 === > EOL  split
 === > }  split
 === > EOL  split

Process finished with exit code 0

发布者

我乃堂堂SCUT的一条咸鱼!

发表评论

电子邮件地址不会被公开。 必填项已用*标注