Wrote a Clite parser– in Prolog!
parser.pl:
% Clite Parser
% 2/6/13
% Justin Mangue
%
% Builds the parse tree for a Clite program according to the concrete syntax given in
% Tucker & Noonan's "Programming Languages Principles and Paradigms" (2nd ed.) pg 38.
% Pretty-prints the result either to terminal or to file.
%
% The input file must be a file containing a list of tokens generated by my lexer.pl
% run on the original Clite source.
%
% Call with: parse('sourcefile'). to output parse tree to terminal
% parse('sourcefile','destfile'). to output parse to specified file.
% Starting call, Output to terminal.
parse(FileIn) :-
runparser(FileIn, user_output).
% Starting call, Output to file.
parse(FileIn, FileOut) :-
open(FileOut, write, OutFileStream),
runparser(FileIn, OutFileStream),
close(OutFileStream).
% Main Routine
runparser(FileIn, OutStream) :-
open(FileIn, read, InFileStream),
build_token_list(InFileStream, TokenList),
close(InFileStream),
build_parse_tree(TokenList, ParseTree),
print_parse_tree(ParseTree, OutStream).
%----- Token List Functions-----%
% Converts the token stream into a list of pairs of format (Token, Token_Type).
build_token_list(InStream, _) :-
at_end_of_stream(InStream), !.
build_token_list(InStream, TokenList) :-
+ at_end_of_stream(InStream),
readToken(InStream, Token),
readToken(InStream, Token_Type),
build_token_list(InStream, NextToken),
TokenList = [(Token, Token_Type) | NextToken].
% Grabs the next ASCII code out of the stream and builds into a Word, then unifies it back into an atom.
readToken(InStream, Word):-
get_code(InStream, Char),
buildtoken(Char, Chars, InStream),
atom_codes(Word, Chars).
% The following ASCII codes indicate the end of a word.
buildtoken(10,[],_):- !. % newline
buildtoken(9,[],_):- !. % tab
buildtoken(32,[],_):- !. % space
buildtoken(-1,[],_):- !. % end of stream
buildtoken(end_of_file,[],_):- !. % eof
% Grab chars and append them to our list until we hit the end of a word, as defined above.
buildtoken(Char,[Char|Rest],InStream):-
get_code(InStream,NextChar),
buildtoken(NextChar,Rest,InStream).
%----- Token List Functions-----%
% Initial call to the DCG functionality
build_parse_tree(Tokens, ParseTree) :-
s_program(ParseTree, Tokens, []).
%----- DCG Definitions for Clite-----%
% Used < > in place of EBNF metabrackets in the comments,
% and _{_ _}_ to designate EBNF metabraces.
% Had to make 4 different definitions for Program to allow for programs with no declarations and/or statements.
% This is a bit of a workaround to keep the parse tree tidy.
% Program -> int main ( ) { Declarations Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), statements(Statements), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.
% Program -> int main ( ) { Declarations }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
[(RCBTok, 'RBRACE')], !.
% Program -> int main ( ) { Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, statements(Statements), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.
% Program -> int main ( ) { }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
[(RCBTok, 'RBRACE')], !.
% Declarations -> _{_ Declaration _}_
s_declarations(Decl) --> s_declaration(Decl).
s_declarations((Decl, MoreDecls)) -->
s_declaration(Decl),
s_declarations(MoreDecls).
s_declarations(_) --> [].
% Declaration -> Type Identifier <[Integer]>_{_, Identifier <[Integer]>_}_;
s_declaration(declaration(Type, Ident, SemiTok)) -->
s_type(Type),
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].
s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, SemiTok)) -->
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].
s_declaration(declaration(Type, Ident, CommaTok, NextDecl)) -->
s_type(Type),
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) -->
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
s_declaration_commaseparatedlist(comma(Ident, SemiTok)) -->
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].
s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, SemiTok)) -->
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].
s_declaration_commaseparatedlist(comma(Ident, CommaTok, NextDecl)) -->
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) -->
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
s_type(type(TypeTok)) -->
[(TypeTok, 'INT')] ;
[(TypeTok, 'BOOL')] ;
[(TypeTok, 'FLOAT')] ;
[(TypeTok, 'CHAR')].
% Statements -> _{_Statement_}_
s_statements(Statement) --> s_statement(Statement).
s_statements((Statement, MoreStatements)) --> s_statement(Statement), s_statements(MoreStatements).
s_statements(_) --> [].
% Statement -> ; | Block | Assignment | IfStatement | WhileStatement
s_statement(statement(SemiColTok)) --> [(SemiColTok, 'SEMICOL')].
s_statement(statement(Block)) --> s_block(Block).
s_statement(statement(Assign)) --> s_assignment(Assign).
s_statement(statement(IfStatement)) --> s_if_statement(IfStatement).
s_statement(statement(WhileStatement)) --> s_while_statement(WhileStatement).
% Block -> { Statements }
s_block(block(LCBTok, Statements, RCBTok)) -->
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')].
% Assignment -> Identifier <[Expression]> = Expression;
s_assignment(assignment(Target, AssignOpTok, Source, SemiTok)) -->
s_identifier(Target),
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].
s_assignment(assignment(Target, LBTok, Expr, RBTok, AssignOpTok, Source, SemiTok)) -->
s_identifier(Target),
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')],
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].
% IfStatement -> if ( Expression ) Statement
s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock)) -->
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock).
s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock, ElseTok, ElseBlock)) -->
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock),
[(ElseTok, 'ELSE')],
s_statement(ElseBlock).
% WhileStatement -> while ( Expression ) Statement
s_while_statement(while_statement(WhileTok, LPTok, Test, RPTok, Statement)) -->
[(WhileTok, 'WHILE')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(Statement).
% Expression -> Conjunction _{_|| Conjunction _}_
s_expression(expression(Expr)) --> s_conjunction(Expr).
s_expression(expression(Expr1, OrOpTok, Expr2)) -->
s_conjunction(Expr1),
[(OrOpTok, 'OR_OP')],
s_expression(Expr2).
% Conjunction -> Equality _{_&& Equality _}_
s_conjunction(conjunction(Conj)) --> s_equality(Conj).
s_conjunction(conjunction(Conj1, AndOpTok, Conj2)) -->
s_equality(Conj1),
[(AndOpTok, 'AND_OP')],
s_conjunction(Conj2).
% Equality -> Relation
s_equality(equality(Eq)) --> s_relation(Eq).
s_equality(equality(Eq1, EqOp, Eq2)) -->
s_relation(Eq1),
s_equ_op(EqOp),
s_relation(Eq2).
% EquOp -> == | !=
s_equ_op(equ_op(EQOp)) -->
[(EQOp, 'EQ_OP')] ;
[(EQOp, 'NE_OP')].
% Relation -> Addition
s_relation(relation(Rel)) --> s_addition(Rel).
s_relation(relation(Rel1, RO, Rel2)) --> s_addition(Rel1), s_rel_op(RO), s_addition(Rel2).
% RelOp -> < | <= | > | >=
s_rel_op(rel_op(ROp)) -->
[(ROp, 'LT_OP')] ;
[(ROp, 'LE_OP')] ;
[(ROp, 'GT_OP')] ;
[(ROp, 'GE_OP')].
% Addition -> Term _{_AddOp Term_}_
s_addition(addition(Add)) --> s_term(Add).
s_addition(addition(Add1, AOp, Add2)) -->
s_term(Add1),
s_add_op(AOp),
s_addition(Add2).
% AddOp -> + | -
s_add_op(add_op(AOp)) -->
[(AOp, 'ADD_OP')] ;
[(AOp, 'SUB_OP')].
% Term -> Factor _{_MulOp Factor_}_
s_term(term(Term)) --> s_factor(Term).
s_term(term(Term1, MOp, Term2)) -->
s_factor(Term1),
s_mul_op(MOp),
s_term(Term2).
% MulOp -> * | / | %
s_mul_op(mul_op(MOp)) -->
[(MOp, 'MUL_OP')] ;
[(MOp, 'DIV_OP')] ;
[(MOp, 'MOD_OP')].
% Factor -> Primary
s_factor(factor(Factor)) --> s_primary(Factor).
s_factor(factor(UOp, Factor)) --> s_unary_op(UOp), s_primary(Factor).
% UnaryOp -> - | !
s_unary_op(unary_op(UOp)) -->
[(UOp, 'SUB_OP')] ;
[(UOp, 'NOT_OP')].
% Primary -> Identifier <[Expression]> | Literal | ( Expression ) | Type ( Expression )
s_primary(primary(Prim)) --> s_identifier(Prim).
s_primary(primary(Id, LBTok, Expr, RBTok)) -->
s_identifier(Id) ;
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')].
s_primary(primary(Prim)) --> s_literal(Prim).
s_primary(primary(LPTok, Expr, RPTok)) -->
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].
s_primary(primary(Type, LPTok, Expr, RPTok)) -->
s_type(type(Type)),
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].
% Identifier -> Letter _{_ Letter | Digit _}_
% Our lexer already handles this.
s_identifier(identifier(Ident)) --> [(Ident, 'IDENT')].
% Literal -> Integer | Boolean | Float | Char
% Our lexer handled these already.
s_literal(literal(Val)) -->
s_intvalue(Val) ;
s_floatvalue(Val) ;
s_boolvalue(Val) ;
s_charvalue(Val).
s_intvalue(integer(Val)) --> [(Val, 'INTEGER')].
s_boolvalue(boolean(Val)) --> [(Val, 'BOOLVAL')].
s_floatvalue(float(Val)) --> [(Val, 'FLOAT')].
s_charvalue(char(Val)) --> [(Val, 'CHAR_LIT')].
%----- Pretty Printing Functions-----%
% These handle converting our parse tree, currently a messy Prolog ground term,
% into a nicely-formatted and indented rendition.
% Initial call handler.
print_parse_tree(Term_ParseTree, OutStream) :-
write_to_codes(Term_ParseTree, Codes_ParseTree),
print_indented(Codes_ParseTree, 0, OutStream).
% Main printing routine. Clears punctuation and adjusts indentation,
% then prints a word and recursively calls itself.
print_indented([],_,_).
print_indented(Codes, Level, OutStream) :-
handle_punctuation(Codes, Rest, Level, NewLevel, OutStream),
print_word(Rest, Tail, NewLevel, OutStream),
print_indented(Tail, NewLevel, OutStream), !.
% handle_punctuation handles adjusting indentation and clearing punctuation from the parse tree.
handle_punctuation([], [], _, _).
handle_punctuation([40|Codes], Rest, Level, NewLevel, OutStream) :- % Case (
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, _, OutStream), !.
handle_punctuation([123|Codes], Rest, Level, FinalLevel, OutStream) :- % Case {
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([41|Codes], Rest, Level, FinalLevel, OutStream) :- % Case )
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([125|Codes], Rest, Level, FinalLevel, OutStream) :- % Case }
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([44|Codes], Rest, Level, FinalLevel, OutStream) :- % Case ,
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.
handle_punctuation([32|Codes], Rest, Level, FinalLevel, OutStream) :- % Case ' '
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.
handle_punctuation(Codes, Codes, Level, Level, _). % No punctuation found.
% print_word handles indentation, calls the word-fetcher, and outputs a newline.
print_word([], Rest, _, _) :- Rest = [].
print_word(Codes, Rest, Level, OutStream) :-
NumTabs is Level * 2,
tab(OutStream, NumTabs),
get_word(Codes, Rest, Level, OutStream),
put_char(OutStream, 'n'), !.
% get_word builds a string until the specified punctuation is found.
get_word([], _, _, _). % End of word
get_word([40|Codes], [40|Codes], _, _) :- !. % (
get_word([41|Codes], [41|Codes], _, _) :- !. % )
get_word([44|Codes], [44|Codes], _, _) :- !. % ,
get_word([32|Codes], [32|Codes], _, _) :- !. % ' '
get_word([123|Codes], [123|Codes], _, _) :- !. % {
get_word([125|Codes], [125|Codes], _, _) :- !. % }
get_word([Char|Codes], Rest, Level, OutStream) :-
put_code(OutStream, Char),
get_word(Codes, Rest, Level, OutStream).
