编译器实践(二)

本语言的文法如下:

programme -> statement blank | expr ; blank | repeat blank | function blank
statement -> if ( expr ) blank block blank magic
magic -> else blank block | yimixilu
block -> { blank programme_multi }
expr -> factor right | true | false | exec right | array right | c_shuzu
right -> double_operator expr | yimixilu
factor -> - primary | primary
primary -> ( expr ) | number | id | string
blank -> EOL blank | yimixilu
repeat -> while ( expr ) blank block
programme_multi -> programme programme_multi | yimixilu
param -> id
params -> param more_para | yimixilu
param_list -> ( params )
function -> func $ id param_list function_block blank
exec -> $ id epl
more_para -> , params | yimixilu
back -> return expr ; | yimixilu
function_block -> { blank programme_multi back blank }
epl -> ( eps )
eps -> ep emp | yimixilu
emp -> , eps | yimixilu
ep -> id | number | string | exec
shuzu -> [ number ] shuzu | yimixilu
array -> & id shuzu
c_shuzu -> 开辟数组 shuzu

然后用的语法分析算法是LL算法,会有一些冲突,构建预测分析表的代码如下:

public class PredictAnalyzeTable {
    private List<String> terminalWords;
    private List<String> nonTerminalWords;
    private List<String> grammers;
    private String[][] table;
    public PredictAnalyzeTable(Reader reader){
        grammers = new ArrayList<>();
        terminalWords = new ArrayList<>();
        nonTerminalWords = new ArrayList<>();
        LineNumberReader r = new LineNumberReader(reader);
        String line;
        try {
            while ((line = r.readLine()) != null) {
                grammers.add(line);
            }
            classify();
            initTable();
            constructTable();
        }catch (IOException io){
            io.printStackTrace();
        }
    }
    private void initTable(){
        table = new String[nonTerminalWords.size()][terminalWords.size()];
    }
    private void constructTable(){
        for (String nonTerminalWord : nonTerminalWords) {
            first(nonTerminalWord);
        }
        bug();
        for (int i = 0;i<nonTerminalWords.size();i++){
            System.out.println(Arrays.toString(table[i]));
        }
    }
    private void classify(){
        for(String s : grammers){
            String[] strings = s.split(" ");
            nonTerminalWords.add(strings[0]);
        }
        for(String s : grammers){
            String[] strings = s.split(" ");
            for (String unit : strings){
                if(nonTerminalWords.contains(unit) || terminalWords.contains(unit) || unit.equals("|") || unit.equals("->") || unit.equals("yimixilu"))
                    continue;
                terminalWords.add(unit);
            }
        }
        System.out.println(nonTerminalWords);
        System.out.println(terminalWords);
    }

    private List<String> first(String nonTerminalWord){
        List<String> list = new ArrayList<>();
        int index = nonTerminalWords.indexOf(nonTerminalWord);
        String body = grammers.get(index).split("-> ")[1];
        String [] multiRoad = body.split(" \\| ");
        for(String road : multiRoad){
            String res = road.split(" ")[0];

            if(nonTerminalWords.contains(res)){
                List<String> firsts = first(res);
                list.addAll(firsts);
                for (String f : firsts){
                    String pi = table[nonTerminalWords.indexOf(nonTerminalWord)][terminalWords.indexOf(f)];
                    if(pi != null && !pi.equals(road))
                        System.out.println("conflict!");
                    table[nonTerminalWords.indexOf(nonTerminalWord)][terminalWords.indexOf(f)] = road;
                }
            }else {
                if(!list.contains(res)){        //如果第一个字符是终结符,则在表中放入对应的生产式
                    list.add(res);
                    if(res.equals("yimixilu")){
                        continue;
                    }
                    String pi = table[nonTerminalWords.indexOf(nonTerminalWord)][terminalWords.indexOf(res)];
                    if(pi != null && !pi.equals(road))
                        System.out.println("conflict!");
                    table[nonTerminalWords.indexOf(nonTerminalWord)][terminalWords.indexOf(res)] = road;
                }
            }

        }
        return list;
    }


    private void bug(){
        table[5][0] = table[5][3] = table[5][11] = table[5][14] = "yimixilu";
        table[8][1] = table[8][2] = table[8][4] = table[8][5] = table[8][6] = table[8][10] = table[8][11] = table[8][12] = table[8][13] = table[8][15] = table[8][19] = table[8][16] = table[8][22] = "yimixilu";
        table[8][0] = "EOL blank";
        table[8][14] = "EOL blank";
        table[10][6] = table[10][19] = "yimixilu";
        table[2][14] = "yimixilu";
        table[12][3] = "yimixilu";
        table[16][3] = table[16][0] = "yimixilu";
        table[17][6] = "yimixilu";
        table[8][17] = "yimixilu";
        table[20][3] = "yimixilu";
        table[21][3] = "yimixilu";
        table[23][0] = table[23][3] = table[23][9] = "yimixilu";
        table[2][12] = "yimixilu";
        table[2][1] = "yimixilu";
        table[2][15] = "yimixilu";
        table[2][16] = "yimixilu";
        table[2][19] = "yimixilu";
    }

    public List<String> getTerminalWords() {
        return terminalWords;
    }

    public List<String> getNonTerminalWords() {
        return nonTerminalWords;
    }

    public String[][] getTable() {
        return table;
    }

}

first函数

就是教程中的First()方法

Bug函数

这里本来是计算教材中的Follow的,但是想了很久都不会,于是直接自己算文法的follow,算完在相应的位置填上,其中yimixilu代表空,在后面碰到这个会直接跳过。

运行效果:

[programme, statement, magic, block, expr, right, factor, primary, blank, repeat, programme_multi, param, params, param_list, function, exec, more_para, back, function_block, epl, eps, emp, ep, shuzu, array, c_shuzu]
[;, if, (, ), else, {, }, true, false, double_operator, -, number, id, string, EOL, while, func, $, ,, return, [, ], &, 开辟数组]

这里的第一个中括号是指代非终结符号,第二个是终结符号,终结符号甚至可以是中文。

[null, statement blank, expr ; blank, null, null, null, null, expr ; blank, expr ; blank, null, expr ; blank, expr ; blank, expr ; blank, expr ; blank, null, repeat blank, function blank, expr ; blank, null, null, null, null, expr ; blank, expr ; blank]
[null, if ( expr ) blank block blank magic, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]
[null, yimixilu, null, null, else blank block, null, null, null, null, null, null, null, yimixilu, null, yimixilu, yimixilu, yimixilu, null, null, yimixilu, null, null, null, null]
[null, null, null, null, null, { blank programme_multi }, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]
[null, null, factor right, null, null, null, null, true, false, null, factor right, factor right, factor right, factor right, null, null, null, exec right, null, null, null, null, array right, c_shuzu]
[yimixilu, null, null, yimixilu, null, null, null, null, null, double_operator expr, null, yimixilu, null, null, yimixilu, null, null, null, null, null, null, null, null, null]
[null, null, primary, null, null, null, null, null, null, null, - primary, primary, primary, primary, null, null, null, null, null, null, null, null, null, null]
[null, null, ( expr ), null, null, null, null, null, null, null, null, number, id, string, null, null, null, null, null, null, null, null, null, null]
[EOL blank, yimixilu, yimixilu, null, yimixilu, yimixilu, yimixilu, null, null, null, yimixilu, yimixilu, yimixilu, yimixilu, EOL blank, yimixilu, yimixilu, yimixilu, null, yimixilu, null, null, yimixilu, null]
[null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, while ( expr ) blank block, null, null, null, null, null, null, null, null]
[null, programme programme_multi, programme programme_multi, null, null, null, yimixilu, programme programme_multi, programme programme_multi, null, programme programme_multi, programme programme_multi, programme programme_multi, programme programme_multi, null, programme programme_multi, programme programme_multi, programme programme_multi, null, yimixilu, null, null, programme programme_multi, programme programme_multi]
[null, null, null, null, null, null, null, null, null, null, null, null, id, null, null, null, null, null, null, null, null, null, null, null]
[null, null, null, yimixilu, null, null, null, null, null, null, null, null, param more_para, null, null, null, null, null, null, null, null, null, null, null]
[null, null, ( params ), null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]
[null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, func $ id param_list function_block blank, null, null, null, null, null, null, null]
[null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, $ id epl, null, null, null, null, null, null]
[yimixilu, null, null, yimixilu, null, null, null, null, null, null, null, null, null, null, null, null, null, null, , params, null, null, null, null, null]
[null, null, null, null, null, null, yimixilu, null, null, null, null, null, null, null, null, null, null, null, null, return expr ;, null, null, null, null]
[null, null, null, null, null, { blank programme_multi back blank }, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]
[null, null, ( eps ), null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]
[null, null, null, yimixilu, null, null, null, null, null, null, null, ep emp, ep emp, ep emp, null, null, null, ep emp, null, null, null, null, null, null]
[null, null, null, yimixilu, null, null, null, null, null, null, null, null, null, null, null, null, null, null, , eps, null, null, null, null, null]
[null, null, null, null, null, null, null, null, null, null, null, number, id, string, null, null, null, exec, null, null, null, null, null, null]
[yimixilu, null, null, yimixilu, null, null, null, null, null, yimixilu, null, null, null, null, null, null, null, null, null, null, [ number ] shuzu, null, null, null]
[null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, & id shuzu, null]
[null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, 开辟数组 shuzu]

这就是预测分析表了,相应的位置给出了当这一步是某个文法时,且碰到的符号是某个时,下一步究竟要做什么。

编译原理实践(一)

为了巩固在编译原理课程上的内容,我们会尝试写一个“编译器”前端,即包括:词法分析,语义分析,语法分析等。直白地说就是写一个自己的脚本语言,可以跑起来之类的。本笔记记录的是脚本语言:MengNan的创建过程,该语言运行在java上。

词法解析

词法解析的作用是告诉编译器代码中的某个单词是什么东西,比如:

int cat = 5;

这段代码在编译器看来应该是这样的:

int id = number;

其中id是变量的专有标记而number很明显用于统一表示数字,注意这里的数字不止整型,还可以:

int cat = 9.666;

带小数的也可以被认为是number

这里额外提一下,这个number的识别靠的是正则表达式:

[0-9]+(.[0-9]+)?

这个式子可以筛选数字

代码

Lexer.java
package Lexer;

import Exception.*;
import Token.*;

import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Lexer {
    public static String regexPattern = "\\s*((//.*)|(if|else|while)|([0-9]+(.[0-9]+)?)|(\"(\\\\\"|\\\\\\\\|\\\\n|[^\"])*\")|([A-Z_a-z][A-Z_a-z0-9]*)|(\\+\\+|--)|(==|[+\\-*/=><%])|([;(){}]))?";

    private Pattern pattern = Pattern.compile(regexPattern);
    private ArrayList<Token> queue = new ArrayList<>();
    private boolean hasMore;
    private LineNumberReader reader;
    private HashMap<String,Object> glossary;

    public Lexer(Reader reader){
        hasMore = true;
        this.reader = new LineNumberReader(reader);
        glossary = new HashMap<>();
    }
    public Token read() throws ParseException {
        if(fillQueue(0))
            return queue.remove(0);
        else
            return Token.EOF;
    }
    public Token peak() throws ParseException {
        if(fillQueue(1))
            return queue.get(1);
        else
            return Token.EOF;
    }
    private boolean fillQueue(int i) throws ParseException {
        while (i >= queue.size())
        {
            if(hasMore)
                readLine();
            else
                return false;
        }
        return true;            //一行的处理结果没有读完,读完再处理下一行
    }
    private void readLine() throws ParseException {
        String line;
        try {
            line = reader.readLine();
        }catch (IOException io){
            throw  new ParseException(io);
        }
        if(line == null){
            hasMore = false;
            return;
        }
        int lineNum = reader.getLineNumber();
        Matcher matcher = pattern.matcher(line);
        matcher.useTransparentBounds(true).useAnchoringBounds(false);
        int pos = 0;
        int endPos = line.length();
        while (pos < endPos){
            matcher.region(pos,endPos);
            //从目标字符串开始位置进行匹配。只有在有匹配且匹配的某一子串中包含目标字符串第一个字符的情况下才会返回true。
            if(matcher.lookingAt()){    
                addToken(lineNum,matcher);      //遇到正则表达式认识的单词,康康是哪个
                pos = matcher.end();            //滑动窗口往前走,把刚刚认识的单词踢出去
            }else
                throw new ParseException("wrong token at line" + lineNum);
        }
        queue.add(new SplitToken(lineNum, Token.EOL));
    }
    private void addToken(int num, Matcher matcher){
        String m = matcher.group(1);
        if(m != null){      //非空格
            if(matcher.group(2) == null){       //非注解,如果是注解直接跳过
                Token token = null;
                if(matcher.group(3) != null)    //这个token是关键字
                    token = new KeywordToken(num, m);
                else if(matcher.group(4) != null)       //是数字
                    token = new NumToken(num, Double.parseDouble(m));
                else if(matcher.group(6) != null)       //是字符串
                    token = new StrToken(num, beautifyString(m));
                else if(matcher.group(8) != null){      //是变量
                    token = new IdToken(num,m);
                    this.glossary.put(m, null);         //变量放到词汇表中
                }
                else if(matcher.group(9) != null)       //是操作符
                    token = new OperToken(num,m);
                else if(matcher.group(10) != null)      //是双目操作符
                    token = new DoubleOperToken(num,m);
                else if(matcher.group(11) != null)      //是分隔符,比如;{}()
                    token = new SplitToken(num,m);
                queue.add(token);       //将解析处来的token加入队列
            }
        }
    }

    private String beautifyString(String s){
        StringBuilder builder = new StringBuilder();
        int len = s.length() - 1;
        for(int i = 1; i < len; i++){       //从1开始是因为舍去第一个双引号
            char c = s.charAt(i);
            if(c == '\\' && i + 1 < len){
                char c2 = s.charAt(i + 1);
                if(c2 == '"' || c2 == '\\'){
                    c = s.charAt(++i);
                }else if(c2 == 'n'){
                    i++;
                    c = '\n';
                }
            }
            builder.append(c);
        }
        return builder.toString();
    }

    public void showGlossary(){
        System.out.println(this.glossary);
    }

    public HashMap<String, Object> getGlossary(){
        return glossary;
    }
}

Token.java
package Token;

public class Token {
    public static final Token EOF = new Token(-1){};
    public static final String EOL = "EOL";
    private int lineNumber;
    public Token(int line){
        lineNumber = line;
    }
    public String getType(){return "none";}
    public boolean isKeyword(){return false;}
    public boolean isOperator(){return false;}
    public boolean isId(){return false;}
    public boolean isNumber(){return false;}
    public boolean isString(){return false;}
    public String getText(){return "";}
    public void setTypeToText(){}
    public int getLineNumber() {
        return lineNumber;
    }
}
KeywordToken.java
package Token;

public class KeywordToken extends Token {
    private String key;
    private String type = "keyword";
    public KeywordToken(int line,String key) {
        super(line);
        this.key = key;
    }

    @Override
    public String getType() {
        return type;
    }

    @Override
    public boolean isKeyword() {
        return true;
    }

    @Override
    public String getText() {
        return key;
    }

}

其他的token类与上面展示的类似

bootstrap.java
public class bootstrap {
    public static void main(String[] args) throws ParseException, IOException {
        Lexer lexer = new Lexer(new FileReader("D:/code.txt"));
        for (Token t; (t = lexer.read()) != Token.EOF;)
            System.out.println(" === > " + t.getText() + "  " + t.getType());
    }
}
code.txt
ppp = 5 + 5;
if(ppp > 55){
    ppp = "hello world";
}else{
    ppp = "bye";
}
flag = 1
while(flag > 0){

}

运行结果

 === > ppp  id
 === > =  double_operator
 === > 5.0  number
 === > +  double_operator
 === > 5.0  number
 === > ;  split
 === > EOL  split
 === > if  keyword
 === > (  split
 === > ppp  id
 === > >  double_operator
 === > 55.0  number
 === > )  split
 === > {  split
 === > EOL  split
 === > ppp  id
 === > =  double_operator
 === > hello world  string
 === > ;  split
 === > EOL  split
 === > }  split
 === > else  keyword
 === > {  split
 === > EOL  split
 === > ppp  id
 === > =  double_operator
 === > bye  string
 === > ;  split
 === > EOL  split
 === > }  split
 === > EOL  split
 === > flag  id
 === > =  double_operator
 === > 1.0  number
 === > EOL  split
 === > while  keyword
 === > (  split
 === > flag  id
 === > >  double_operator
 === > 0.0  number
 === > )  split
 === > {  split
 === > EOL  split
 === > EOL  split
 === > }  split
 === > EOL  split

Process finished with exit code 0