Wrote a Clite parser– in Prolog!
parser.pl:
% Clite Parser % 2/6/13 % Justin Mangue % % Builds the parse tree for a Clite program according to the concrete syntax given in % Tucker & Noonan's "Programming Languages Principles and Paradigms" (2nd ed.) pg 38. % Pretty-prints the result either to terminal or to file. % % The input file must be a file containing a list of tokens generated by my lexer.pl % run on the original Clite source. % % Call with: parse('sourcefile'). to output parse tree to terminal % parse('sourcefile','destfile'). to output parse to specified file. % Starting call, Output to terminal. parse(FileIn) :- runparser(FileIn, user_output). % Starting call, Output to file. parse(FileIn, FileOut) :- open(FileOut, write, OutFileStream), runparser(FileIn, OutFileStream), close(OutFileStream). % Main Routine runparser(FileIn, OutStream) :- open(FileIn, read, InFileStream), build_token_list(InFileStream, TokenList), close(InFileStream), build_parse_tree(TokenList, ParseTree), print_parse_tree(ParseTree, OutStream). %----- Token List Functions-----% % Converts the token stream into a list of pairs of format (Token, Token_Type). build_token_list(InStream, _) :- at_end_of_stream(InStream), !. build_token_list(InStream, TokenList) :- + at_end_of_stream(InStream), readToken(InStream, Token), readToken(InStream, Token_Type), build_token_list(InStream, NextToken), TokenList = [(Token, Token_Type) | NextToken]. % Grabs the next ASCII code out of the stream and builds into a Word, then unifies it back into an atom. readToken(InStream, Word):- get_code(InStream, Char), buildtoken(Char, Chars, InStream), atom_codes(Word, Chars). % The following ASCII codes indicate the end of a word. buildtoken(10,[],_):- !. % newline buildtoken(9,[],_):- !. % tab buildtoken(32,[],_):- !. % space buildtoken(-1,[],_):- !. % end of stream buildtoken(end_of_file,[],_):- !. % eof % Grab chars and append them to our list until we hit the end of a word, as defined above. buildtoken(Char,[Char|Rest],InStream):- get_code(InStream,NextChar), buildtoken(NextChar,Rest,InStream). %----- Token List Functions-----% % Initial call to the DCG functionality build_parse_tree(Tokens, ParseTree) :- s_program(ParseTree, Tokens, []). %----- DCG Definitions for Clite-----% % Used < > in place of EBNF metabrackets in the comments, % and _{_ _}_ to designate EBNF metabraces. % Had to make 4 different definitions for Program to allow for programs with no declarations and/or statements. % This is a bit of a workaround to keep the parse tree tidy. % Program -> int main ( ) { Declarations Statements } s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), statements(Statements), RCBTok)) --> [(IntTok, 'INT')], [(MainTok, 'MAIN')], [(LPTok, 'LPAREN')], [(RPTok, 'RPAREN')], [(LCBTok, 'LBRACE')], s_declarations(Declarations), s_statements(Statements), [(RCBTok, 'RBRACE')], !. % Program -> int main ( ) { Declarations } s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), RCBTok)) --> [(IntTok, 'INT')], [(MainTok, 'MAIN')], [(LPTok, 'LPAREN')], [(RPTok, 'RPAREN')], [(LCBTok, 'LBRACE')], s_declarations(Declarations), [(RCBTok, 'RBRACE')], !. % Program -> int main ( ) { Statements } s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, statements(Statements), RCBTok)) --> [(IntTok, 'INT')], [(MainTok, 'MAIN')], [(LPTok, 'LPAREN')], [(RPTok, 'RPAREN')], [(LCBTok, 'LBRACE')], s_statements(Statements), [(RCBTok, 'RBRACE')], !. % Program -> int main ( ) { } s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, RCBTok)) --> [(IntTok, 'INT')], [(MainTok, 'MAIN')], [(LPTok, 'LPAREN')], [(RPTok, 'RPAREN')], [(LCBTok, 'LBRACE')], [(RCBTok, 'RBRACE')], !. % Declarations -> _{_ Declaration _}_ s_declarations(Decl) --> s_declaration(Decl). s_declarations((Decl, MoreDecls)) --> s_declaration(Decl), s_declarations(MoreDecls). s_declarations(_) --> []. % Declaration -> Type Identifier <[Integer]>_{_, Identifier <[Integer]>_}_; s_declaration(declaration(Type, Ident, SemiTok)) --> s_type(Type), s_identifier(Ident), [(SemiTok, 'SEMICOL')]. s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, SemiTok)) --> s_type(Type), s_identifier(Ident), [(LBTok, 'LBRACK')], s_intvalue(Int), [(RBTok, 'RBRACK')], [(SemiTok, 'SEMICOL')]. s_declaration(declaration(Type, Ident, CommaTok, NextDecl)) --> s_type(Type), s_identifier(Ident), [(CommaTok, 'COMMA')], s_declaration_commaseparatedlist(NextDecl). s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) --> s_type(Type), s_identifier(Ident), [(LBTok, 'LBRACK')], s_intvalue(Int), [(RBTok, 'RBRACK')], [(CommaTok, 'COMMA')], s_declaration_commaseparatedlist(NextDecl). s_declaration_commaseparatedlist(comma(Ident, SemiTok)) --> s_identifier(Ident), [(SemiTok, 'SEMICOL')]. s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, SemiTok)) --> s_identifier(Ident), [(LBTok, 'LBRACK')], s_intvalue(Int), [(RBTok, 'RBRACK')], [(SemiTok, 'SEMICOL')]. s_declaration_commaseparatedlist(comma(Ident, CommaTok, NextDecl)) --> s_identifier(Ident), [(CommaTok, 'COMMA')], s_declaration_commaseparatedlist(NextDecl). s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) --> s_identifier(Ident), [(LBTok, 'LBRACK')], s_intvalue(Int), [(RBTok, 'RBRACK')], [(CommaTok, 'COMMA')], s_declaration_commaseparatedlist(NextDecl). s_type(type(TypeTok)) --> [(TypeTok, 'INT')] ; [(TypeTok, 'BOOL')] ; [(TypeTok, 'FLOAT')] ; [(TypeTok, 'CHAR')]. % Statements -> _{_Statement_}_ s_statements(Statement) --> s_statement(Statement). s_statements((Statement, MoreStatements)) --> s_statement(Statement), s_statements(MoreStatements). s_statements(_) --> []. % Statement -> ; | Block | Assignment | IfStatement | WhileStatement s_statement(statement(SemiColTok)) --> [(SemiColTok, 'SEMICOL')]. s_statement(statement(Block)) --> s_block(Block). s_statement(statement(Assign)) --> s_assignment(Assign). s_statement(statement(IfStatement)) --> s_if_statement(IfStatement). s_statement(statement(WhileStatement)) --> s_while_statement(WhileStatement). % Block -> { Statements } s_block(block(LCBTok, Statements, RCBTok)) --> [(LCBTok, 'LBRACE')], s_statements(Statements), [(RCBTok, 'RBRACE')]. % Assignment -> Identifier <[Expression]> = Expression; s_assignment(assignment(Target, AssignOpTok, Source, SemiTok)) --> s_identifier(Target), [(AssignOpTok, 'ASSIGN')], s_expression(Source), [(SemiTok, 'SEMICOL')]. s_assignment(assignment(Target, LBTok, Expr, RBTok, AssignOpTok, Source, SemiTok)) --> s_identifier(Target), [(LBTok, 'LBRACK')], s_expression(Expr), [(RBTok, 'RBRACK')], [(AssignOpTok, 'ASSIGN')], s_expression(Source), [(SemiTok, 'SEMICOL')]. % IfStatement -> if ( Expression ) Statement s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock)) --> [(IfTok, 'IF')], [(LPTok, 'LPAREN')], s_expression(Test), [(RPTok, 'RPAREN')], s_statement(ThenBlock). s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock, ElseTok, ElseBlock)) --> [(IfTok, 'IF')], [(LPTok, 'LPAREN')], s_expression(Test), [(RPTok, 'RPAREN')], s_statement(ThenBlock), [(ElseTok, 'ELSE')], s_statement(ElseBlock). % WhileStatement -> while ( Expression ) Statement s_while_statement(while_statement(WhileTok, LPTok, Test, RPTok, Statement)) --> [(WhileTok, 'WHILE')], [(LPTok, 'LPAREN')], s_expression(Test), [(RPTok, 'RPAREN')], s_statement(Statement). % Expression -> Conjunction _{_|| Conjunction _}_ s_expression(expression(Expr)) --> s_conjunction(Expr). s_expression(expression(Expr1, OrOpTok, Expr2)) --> s_conjunction(Expr1), [(OrOpTok, 'OR_OP')], s_expression(Expr2). % Conjunction -> Equality _{_&& Equality _}_ s_conjunction(conjunction(Conj)) --> s_equality(Conj). s_conjunction(conjunction(Conj1, AndOpTok, Conj2)) --> s_equality(Conj1), [(AndOpTok, 'AND_OP')], s_conjunction(Conj2). % Equality -> Relation s_equality(equality(Eq)) --> s_relation(Eq). s_equality(equality(Eq1, EqOp, Eq2)) --> s_relation(Eq1), s_equ_op(EqOp), s_relation(Eq2). % EquOp -> == | != s_equ_op(equ_op(EQOp)) --> [(EQOp, 'EQ_OP')] ; [(EQOp, 'NE_OP')]. % Relation -> Addition s_relation(relation(Rel)) --> s_addition(Rel). s_relation(relation(Rel1, RO, Rel2)) --> s_addition(Rel1), s_rel_op(RO), s_addition(Rel2). % RelOp -> < | <= | > | >= s_rel_op(rel_op(ROp)) --> [(ROp, 'LT_OP')] ; [(ROp, 'LE_OP')] ; [(ROp, 'GT_OP')] ; [(ROp, 'GE_OP')]. % Addition -> Term _{_AddOp Term_}_ s_addition(addition(Add)) --> s_term(Add). s_addition(addition(Add1, AOp, Add2)) --> s_term(Add1), s_add_op(AOp), s_addition(Add2). % AddOp -> + | - s_add_op(add_op(AOp)) --> [(AOp, 'ADD_OP')] ; [(AOp, 'SUB_OP')]. % Term -> Factor _{_MulOp Factor_}_ s_term(term(Term)) --> s_factor(Term). s_term(term(Term1, MOp, Term2)) --> s_factor(Term1), s_mul_op(MOp), s_term(Term2). % MulOp -> * | / | % s_mul_op(mul_op(MOp)) --> [(MOp, 'MUL_OP')] ; [(MOp, 'DIV_OP')] ; [(MOp, 'MOD_OP')]. % Factor -> Primary s_factor(factor(Factor)) --> s_primary(Factor). s_factor(factor(UOp, Factor)) --> s_unary_op(UOp), s_primary(Factor). % UnaryOp -> - | ! s_unary_op(unary_op(UOp)) --> [(UOp, 'SUB_OP')] ; [(UOp, 'NOT_OP')]. % Primary -> Identifier <[Expression]> | Literal | ( Expression ) | Type ( Expression ) s_primary(primary(Prim)) --> s_identifier(Prim). s_primary(primary(Id, LBTok, Expr, RBTok)) --> s_identifier(Id) ; [(LBTok, 'LBRACK')], s_expression(Expr), [(RBTok, 'RBRACK')]. s_primary(primary(Prim)) --> s_literal(Prim). s_primary(primary(LPTok, Expr, RPTok)) --> [(LPTok, 'LPAREN')], s_expression(Expr), [(RPTok, 'RPAREN')]. s_primary(primary(Type, LPTok, Expr, RPTok)) --> s_type(type(Type)), [(LPTok, 'LPAREN')], s_expression(Expr), [(RPTok, 'RPAREN')]. % Identifier -> Letter _{_ Letter | Digit _}_ % Our lexer already handles this. s_identifier(identifier(Ident)) --> [(Ident, 'IDENT')]. % Literal -> Integer | Boolean | Float | Char % Our lexer handled these already. s_literal(literal(Val)) --> s_intvalue(Val) ; s_floatvalue(Val) ; s_boolvalue(Val) ; s_charvalue(Val). s_intvalue(integer(Val)) --> [(Val, 'INTEGER')]. s_boolvalue(boolean(Val)) --> [(Val, 'BOOLVAL')]. s_floatvalue(float(Val)) --> [(Val, 'FLOAT')]. s_charvalue(char(Val)) --> [(Val, 'CHAR_LIT')]. %----- Pretty Printing Functions-----% % These handle converting our parse tree, currently a messy Prolog ground term, % into a nicely-formatted and indented rendition. % Initial call handler. print_parse_tree(Term_ParseTree, OutStream) :- write_to_codes(Term_ParseTree, Codes_ParseTree), print_indented(Codes_ParseTree, 0, OutStream). % Main printing routine. Clears punctuation and adjusts indentation, % then prints a word and recursively calls itself. print_indented([],_,_). print_indented(Codes, Level, OutStream) :- handle_punctuation(Codes, Rest, Level, NewLevel, OutStream), print_word(Rest, Tail, NewLevel, OutStream), print_indented(Tail, NewLevel, OutStream), !. % handle_punctuation handles adjusting indentation and clearing punctuation from the parse tree. handle_punctuation([], [], _, _). handle_punctuation([40|Codes], Rest, Level, NewLevel, OutStream) :- % Case ( NewLevel is Level + 1, handle_punctuation(Codes, Rest, NewLevel, _, OutStream), !. handle_punctuation([123|Codes], Rest, Level, FinalLevel, OutStream) :- % Case { NewLevel is Level + 1, handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !. handle_punctuation([41|Codes], Rest, Level, FinalLevel, OutStream) :- % Case ) NewLevel is Level - 1, handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !. handle_punctuation([125|Codes], Rest, Level, FinalLevel, OutStream) :- % Case } NewLevel is Level - 1, handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !. handle_punctuation([44|Codes], Rest, Level, FinalLevel, OutStream) :- % Case , handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !. handle_punctuation([32|Codes], Rest, Level, FinalLevel, OutStream) :- % Case ' ' handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !. handle_punctuation(Codes, Codes, Level, Level, _). % No punctuation found. % print_word handles indentation, calls the word-fetcher, and outputs a newline. print_word([], Rest, _, _) :- Rest = []. print_word(Codes, Rest, Level, OutStream) :- NumTabs is Level * 2, tab(OutStream, NumTabs), get_word(Codes, Rest, Level, OutStream), put_char(OutStream, 'n'), !. % get_word builds a string until the specified punctuation is found. get_word([], _, _, _). % End of word get_word([40|Codes], [40|Codes], _, _) :- !. % ( get_word([41|Codes], [41|Codes], _, _) :- !. % ) get_word([44|Codes], [44|Codes], _, _) :- !. % , get_word([32|Codes], [32|Codes], _, _) :- !. % ' ' get_word([123|Codes], [123|Codes], _, _) :- !. % { get_word([125|Codes], [125|Codes], _, _) :- !. % } get_word([Char|Codes], Rest, Level, OutStream) :- put_code(OutStream, Char), get_word(Codes, Rest, Level, OutStream).