Prolog implementation of a C-lite parser

Wrote a Clite parser– in Prolog!

parser.pl:

% Clite Parser
% 2/6/13
% Justin Mangue
%
% Builds the parse tree for a Clite program according to the concrete syntax given in
% Tucker & Noonan's "Programming Languages Principles and Paradigms" (2nd ed.) pg 38.
% Pretty-prints the result either to terminal or to file.
%
% The input file must be a file containing a list of tokens generated by my lexer.pl
% run on the original Clite source.
%
% Call with: parse('sourcefile').             to output parse tree to terminal
%            parse('sourcefile','destfile').  to output parse to specified file.

% Starting call, Output to terminal.
parse(FileIn) :-
runparser(FileIn, user_output).

% Starting call, Output to file.
parse(FileIn, FileOut) :-
open(FileOut, write, OutFileStream),
runparser(FileIn, OutFileStream),
close(OutFileStream).

% Main Routine
runparser(FileIn, OutStream) :-
open(FileIn, read, InFileStream),
build_token_list(InFileStream, TokenList),
close(InFileStream),
build_parse_tree(TokenList, ParseTree),
print_parse_tree(ParseTree, OutStream).

%----- Token List Functions-----%
% Converts the token stream into a list of pairs of format (Token, Token_Type).

build_token_list(InStream, _) :-
at_end_of_stream(InStream), !.

build_token_list(InStream, TokenList) :-
+ at_end_of_stream(InStream),
readToken(InStream, Token),
readToken(InStream, Token_Type),
build_token_list(InStream, NextToken),
TokenList = [(Token, Token_Type) | NextToken].

% Grabs the next ASCII code out of the stream and builds into a Word, then unifies it back into an atom.
readToken(InStream, Word):-
get_code(InStream, Char),
buildtoken(Char, Chars, InStream),
atom_codes(Word, Chars).

% The following ASCII codes indicate the end of a word.
buildtoken(10,[],_):-  !. 		   % newline
buildtoken(9,[],_):-  !. 		   % tab
buildtoken(32,[],_):-  !. 		   % space
buildtoken(-1,[],_):-  !. 		   % end of stream
buildtoken(end_of_file,[],_):-  !.  % eof

% Grab chars and append them to our list until we hit the end of a word, as defined above.
buildtoken(Char,[Char|Rest],InStream):-
get_code(InStream,NextChar),
buildtoken(NextChar,Rest,InStream).

%----- Token List Functions-----%

% Initial call to the DCG functionality
build_parse_tree(Tokens, ParseTree) :-
s_program(ParseTree, Tokens, []).

%----- DCG Definitions for Clite-----%
% Used < > in place of EBNF metabrackets in the comments,
% and _{_ _}_ to designate EBNF metabraces.

% Had to make 4 different definitions for Program to allow for programs with no declarations and/or statements.
% This is a bit of a workaround to keep the parse tree tidy.

% Program -> int main ( ) { Declarations Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), statements(Statements), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.

% Program -> int main ( ) { Declarations }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
[(RCBTok, 'RBRACE')], !.

% Program -> int main ( ) { Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, statements(Statements), RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.

% Program -> int main ( ) { }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, RCBTok)) -->
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
[(RCBTok, 'RBRACE')], !.

% Declarations -> _{_ Declaration _}_
s_declarations(Decl) --> s_declaration(Decl).
s_declarations((Decl, MoreDecls)) -->
s_declaration(Decl),
s_declarations(MoreDecls).
s_declarations(_) --> [].

% Declaration -> Type Identifier <[Integer]>_{_, Identifier <[Integer]>_}_;
s_declaration(declaration(Type, Ident, SemiTok)) -->
s_type(Type),
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].

s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, SemiTok)) -->
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].

s_declaration(declaration(Type, Ident, CommaTok, NextDecl)) -->
s_type(Type),
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).

s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) -->
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).

s_declaration_commaseparatedlist(comma(Ident, SemiTok)) -->
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].

s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, SemiTok)) -->
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].

s_declaration_commaseparatedlist(comma(Ident, CommaTok, NextDecl)) -->
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).

s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) -->
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).

s_type(type(TypeTok)) -->
[(TypeTok, 'INT')] ;
[(TypeTok, 'BOOL')] ;
[(TypeTok, 'FLOAT')] ;
[(TypeTok, 'CHAR')].

% Statements -> _{_Statement_}_
s_statements(Statement) --> s_statement(Statement).
s_statements((Statement, MoreStatements)) --> s_statement(Statement), s_statements(MoreStatements).
s_statements(_) --> [].

% Statement -> ; | Block | Assignment | IfStatement | WhileStatement
s_statement(statement(SemiColTok)) --> [(SemiColTok, 'SEMICOL')].
s_statement(statement(Block)) --> s_block(Block).
s_statement(statement(Assign)) --> s_assignment(Assign).
s_statement(statement(IfStatement)) --> s_if_statement(IfStatement).
s_statement(statement(WhileStatement)) --> s_while_statement(WhileStatement).

% Block -> { Statements }
s_block(block(LCBTok, Statements, RCBTok)) -->
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')].

% Assignment -> Identifier <[Expression]> = Expression;
s_assignment(assignment(Target, AssignOpTok, Source, SemiTok)) -->
s_identifier(Target),
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].

s_assignment(assignment(Target, LBTok, Expr, RBTok, AssignOpTok, Source, SemiTok)) -->
s_identifier(Target),
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')],
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].

% IfStatement -> if ( Expression ) Statement
s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock)) -->
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock).

s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock, ElseTok, ElseBlock)) -->
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock),
[(ElseTok, 'ELSE')],
s_statement(ElseBlock).

% WhileStatement -> while ( Expression ) Statement
s_while_statement(while_statement(WhileTok, LPTok, Test, RPTok, Statement)) -->
[(WhileTok, 'WHILE')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(Statement).

% Expression -> Conjunction _{_|| Conjunction _}_
s_expression(expression(Expr)) --> s_conjunction(Expr).
s_expression(expression(Expr1, OrOpTok, Expr2)) -->
s_conjunction(Expr1),
[(OrOpTok, 'OR_OP')],
s_expression(Expr2).

% Conjunction -> Equality _{_&& Equality _}_
s_conjunction(conjunction(Conj)) --> s_equality(Conj).
s_conjunction(conjunction(Conj1, AndOpTok, Conj2)) -->
s_equality(Conj1),
[(AndOpTok, 'AND_OP')],
s_conjunction(Conj2).

% Equality -> Relation
s_equality(equality(Eq)) --> s_relation(Eq).
s_equality(equality(Eq1, EqOp, Eq2)) -->
s_relation(Eq1),
s_equ_op(EqOp),
s_relation(Eq2).

% EquOp -> == | !=
s_equ_op(equ_op(EQOp)) -->
[(EQOp, 'EQ_OP')] ;
[(EQOp, 'NE_OP')].

% Relation -> Addition
s_relation(relation(Rel)) --> s_addition(Rel).
s_relation(relation(Rel1, RO, Rel2)) --> s_addition(Rel1), s_rel_op(RO), s_addition(Rel2).

% RelOp -> < | <= | > | >=
s_rel_op(rel_op(ROp)) -->
[(ROp, 'LT_OP')] ;
[(ROp, 'LE_OP')] ;
[(ROp, 'GT_OP')] ;
[(ROp, 'GE_OP')].

% Addition -> Term _{_AddOp Term_}_
s_addition(addition(Add)) --> s_term(Add).
s_addition(addition(Add1, AOp, Add2)) -->
s_term(Add1),
s_add_op(AOp),
s_addition(Add2).

% AddOp -> + | -
s_add_op(add_op(AOp)) -->
[(AOp, 'ADD_OP')] ;
[(AOp, 'SUB_OP')].

% Term -> Factor _{_MulOp Factor_}_
s_term(term(Term)) --> s_factor(Term).
s_term(term(Term1, MOp, Term2)) -->
s_factor(Term1),
s_mul_op(MOp),
s_term(Term2).

% MulOp -> * | / | %
s_mul_op(mul_op(MOp)) -->
[(MOp, 'MUL_OP')] ;
[(MOp, 'DIV_OP')] ;
[(MOp, 'MOD_OP')].

% Factor ->  Primary
s_factor(factor(Factor)) --> s_primary(Factor).
s_factor(factor(UOp, Factor)) --> s_unary_op(UOp), s_primary(Factor).

% UnaryOp -> - | !
s_unary_op(unary_op(UOp)) -->
[(UOp, 'SUB_OP')] ;
[(UOp, 'NOT_OP')].

% Primary -> Identifier <[Expression]> | Literal | ( Expression ) | Type ( Expression )
s_primary(primary(Prim)) --> s_identifier(Prim).

s_primary(primary(Id, LBTok, Expr, RBTok)) -->
s_identifier(Id) ;
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')].

s_primary(primary(Prim)) --> s_literal(Prim).

s_primary(primary(LPTok, Expr, RPTok)) -->
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].

s_primary(primary(Type, LPTok, Expr, RPTok)) -->
s_type(type(Type)),
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].

% Identifier -> Letter _{_ Letter | Digit _}_
% Our lexer already handles this.
s_identifier(identifier(Ident)) --> [(Ident, 'IDENT')].

% Literal -> Integer | Boolean | Float | Char
% Our lexer handled these already.
s_literal(literal(Val)) -->
s_intvalue(Val) ;
s_floatvalue(Val) ;
s_boolvalue(Val) ;
s_charvalue(Val).

s_intvalue(integer(Val)) --> [(Val, 'INTEGER')].
s_boolvalue(boolean(Val)) --> [(Val, 'BOOLVAL')].
s_floatvalue(float(Val)) --> [(Val, 'FLOAT')].
s_charvalue(char(Val)) --> [(Val, 'CHAR_LIT')].

%----- Pretty Printing Functions-----%
% These handle converting our parse tree, currently a messy Prolog ground term,
% into a nicely-formatted and indented rendition.

% Initial call handler.
print_parse_tree(Term_ParseTree, OutStream) :-
write_to_codes(Term_ParseTree, Codes_ParseTree),
print_indented(Codes_ParseTree, 0, OutStream).

% Main printing routine.  Clears punctuation and adjusts indentation,
%  then prints a word and recursively calls itself.
print_indented([],_,_).
print_indented(Codes, Level, OutStream) :-
handle_punctuation(Codes, Rest, Level, NewLevel, OutStream),
print_word(Rest, Tail, NewLevel, OutStream),
print_indented(Tail, NewLevel, OutStream), !.

% handle_punctuation handles adjusting indentation and clearing punctuation from the parse tree.
handle_punctuation([], [], _, _).
handle_punctuation([40|Codes], Rest, Level, NewLevel, OutStream) :-     % Case (
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, _, OutStream), !.
handle_punctuation([123|Codes], Rest, Level, FinalLevel, OutStream) :-  % Case {
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([41|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case )
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([125|Codes], Rest, Level, FinalLevel, OutStream) :-  % Case }
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([44|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case ,
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.
handle_punctuation([32|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case ' '
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.

handle_punctuation(Codes, Codes, Level, Level, _). % No punctuation found.

% print_word handles indentation, calls the word-fetcher, and outputs a newline.
print_word([], Rest, _, _) :- Rest = [].
print_word(Codes, Rest, Level, OutStream) :-
NumTabs is Level * 2,
tab(OutStream, NumTabs),
get_word(Codes, Rest, Level, OutStream),
put_char(OutStream, 'n'), !.

% get_word builds a string until the specified punctuation is found.
get_word([], _, _, _).                              % End of word
get_word([40|Codes], [40|Codes], _, _) :- !.		% (
get_word([41|Codes], [41|Codes], _, _) :- !.		% )
get_word([44|Codes], [44|Codes], _, _) :- !.		% ,
get_word([32|Codes], [32|Codes], _, _) :- !.		% ' '
get_word([123|Codes], [123|Codes], _, _) :- !.		% {
get_word([125|Codes], [125|Codes], _, _) :- !. 		% }
get_word([Char|Codes], Rest, Level, OutStream) :-
put_code(OutStream, Char),
get_word(Codes, Rest, Level, OutStream).
About the Author: Justin
A 34 year old Software Engineer in Seattle, WA with a love for coding, music, video games, and the great outdoors.
Author Website: http://www.justinmangue.com

Leave a Reply

Your email address will not be published. Required fields are marked *