Prolog implementation of a C-lite parser | pixel by pixel

Justin | February 6, 2013

Prolog implementation of a C-lite parser

Wrote a Clite parser– in Prolog!

parser.pl:

% Clite Parser
% 2/6/13
% Justin Mangue
%
% Builds the parse tree for a Clite program according to the concrete syntax given in
% Tucker &amp; Noonan's "Programming Languages Principles and Paradigms" (2nd ed.) pg 38.
% Pretty-prints the result either to terminal or to file.
%
% The input file must be a file containing a list of tokens generated by my lexer.pl
% run on the original Clite source.
%
% Call with: parse('sourcefile').             to output parse tree to terminal
%            parse('sourcefile','destfile').  to output parse to specified file.
 
% Starting call, Output to terminal.
parse(FileIn) :-
runparser(FileIn, user_output).
 
% Starting call, Output to file.
parse(FileIn, FileOut) :-
open(FileOut, write, OutFileStream),
runparser(FileIn, OutFileStream),
close(OutFileStream).
 
% Main Routine
runparser(FileIn, OutStream) :-
open(FileIn, read, InFileStream),
build_token_list(InFileStream, TokenList),
close(InFileStream),
build_parse_tree(TokenList, ParseTree),
print_parse_tree(ParseTree, OutStream).
 
%----- Token List Functions-----%
% Converts the token stream into a list of pairs of format (Token, Token_Type).
 
build_token_list(InStream, _) :-
at_end_of_stream(InStream), !.
 
build_token_list(InStream, TokenList) :-
+ at_end_of_stream(InStream),
readToken(InStream, Token),
readToken(InStream, Token_Type),
build_token_list(InStream, NextToken),
TokenList = [(Token, Token_Type) | NextToken].
 
% Grabs the next ASCII code out of the stream and builds into a Word, then unifies it back into an atom.
readToken(InStream, Word):-
get_code(InStream, Char),
buildtoken(Char, Chars, InStream),
atom_codes(Word, Chars).
 
% The following ASCII codes indicate the end of a word.
buildtoken(10,[],_):-  !.          % newline
buildtoken(9,[],_):-  !.           % tab
buildtoken(32,[],_):-  !.          % space
buildtoken(-1,[],_):-  !.          % end of stream
buildtoken(end_of_file,[],_):-  !.  % eof
 
% Grab chars and append them to our list until we hit the end of a word, as defined above.
buildtoken(Char,[Char|Rest],InStream):-
get_code(InStream,NextChar),
buildtoken(NextChar,Rest,InStream).
 
%----- Token List Functions-----%
 
% Initial call to the DCG functionality
build_parse_tree(Tokens, ParseTree) :-
s_program(ParseTree, Tokens, []).
 
%----- DCG Definitions for Clite-----%
% Used &lt; &gt; in place of EBNF metabrackets in the comments,
% and _{_ _}_ to designate EBNF metabraces.
 
% Had to make 4 different definitions for Program to allow for programs with no declarations and/or statements.
% This is a bit of a workaround to keep the parse tree tidy.
 
% Program -&gt; int main ( ) { Declarations Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), statements(Statements), RCBTok)) --&gt;
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.
 
% Program -&gt; int main ( ) { Declarations }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, declarations(Declarations), RCBTok)) --&gt;
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_declarations(Declarations),
[(RCBTok, 'RBRACE')], !.
 
% Program -&gt; int main ( ) { Statements }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, statements(Statements), RCBTok)) --&gt;
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')], !.
 
% Program -&gt; int main ( ) { }
s_program(program(IntTok, MainTok, LPTok, RPTok, LCBTok, RCBTok)) --&gt;
[(IntTok, 'INT')],
[(MainTok, 'MAIN')],
[(LPTok, 'LPAREN')],
[(RPTok, 'RPAREN')],
[(LCBTok, 'LBRACE')],
[(RCBTok, 'RBRACE')], !.
 
% Declarations -&gt; _{_ Declaration _}_
s_declarations(Decl) --&gt; s_declaration(Decl).
s_declarations((Decl, MoreDecls)) --&gt;
s_declaration(Decl),
s_declarations(MoreDecls).
s_declarations(_) --&gt; [].
 
% Declaration -&gt; Type Identifier &lt;[Integer]&gt;_{_, Identifier &lt;[Integer]&gt;_}_;
s_declaration(declaration(Type, Ident, SemiTok)) --&gt;
s_type(Type),
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].
 
s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, SemiTok)) --&gt;
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].
 
s_declaration(declaration(Type, Ident, CommaTok, NextDecl)) --&gt;
s_type(Type),
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
 
s_declaration(declaration(Type, Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) --&gt;
s_type(Type),
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
 
s_declaration_commaseparatedlist(comma(Ident, SemiTok)) --&gt;
s_identifier(Ident),
[(SemiTok, 'SEMICOL')].
 
s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, SemiTok)) --&gt;
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(SemiTok, 'SEMICOL')].
 
s_declaration_commaseparatedlist(comma(Ident, CommaTok, NextDecl)) --&gt;
s_identifier(Ident),
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
 
s_declaration_commaseparatedlist(comma(Ident, LBTok, Int, RBTok, CommaTok, NextDecl)) --&gt;
s_identifier(Ident),
[(LBTok, 'LBRACK')],
s_intvalue(Int),
[(RBTok, 'RBRACK')],
[(CommaTok, 'COMMA')],
s_declaration_commaseparatedlist(NextDecl).
 
s_type(type(TypeTok)) --&gt;
[(TypeTok, 'INT')] ;
[(TypeTok, 'BOOL')] ;
[(TypeTok, 'FLOAT')] ;
[(TypeTok, 'CHAR')].
 
% Statements -&gt; _{_Statement_}_
s_statements(Statement) --&gt; s_statement(Statement).
s_statements((Statement, MoreStatements)) --&gt; s_statement(Statement), s_statements(MoreStatements).
s_statements(_) --&gt; [].
 
% Statement -&gt; ; | Block | Assignment | IfStatement | WhileStatement
s_statement(statement(SemiColTok)) --&gt; [(SemiColTok, 'SEMICOL')].
s_statement(statement(Block)) --&gt; s_block(Block).
s_statement(statement(Assign)) --&gt; s_assignment(Assign).
s_statement(statement(IfStatement)) --&gt; s_if_statement(IfStatement).
s_statement(statement(WhileStatement)) --&gt; s_while_statement(WhileStatement).
 
% Block -&gt; { Statements }
s_block(block(LCBTok, Statements, RCBTok)) --&gt;
[(LCBTok, 'LBRACE')],
s_statements(Statements),
[(RCBTok, 'RBRACE')].
 
% Assignment -&gt; Identifier &lt;[Expression]&gt; = Expression;
s_assignment(assignment(Target, AssignOpTok, Source, SemiTok)) --&gt;
s_identifier(Target),
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].
 
s_assignment(assignment(Target, LBTok, Expr, RBTok, AssignOpTok, Source, SemiTok)) --&gt;
s_identifier(Target),
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')],
[(AssignOpTok, 'ASSIGN')],
s_expression(Source),
[(SemiTok, 'SEMICOL')].
 
% IfStatement -&gt; if ( Expression ) Statement
s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock)) --&gt;
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock).
 
s_if_statement(if_statement(IfTok, LPTok, Test, RPTok, ThenBlock, ElseTok, ElseBlock)) --&gt;
[(IfTok, 'IF')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(ThenBlock),
[(ElseTok, 'ELSE')],
s_statement(ElseBlock).
 
% WhileStatement -&gt; while ( Expression ) Statement
s_while_statement(while_statement(WhileTok, LPTok, Test, RPTok, Statement)) --&gt;
[(WhileTok, 'WHILE')],
[(LPTok, 'LPAREN')],
s_expression(Test),
[(RPTok, 'RPAREN')],
s_statement(Statement).
 
% Expression -&gt; Conjunction _{_|| Conjunction _}_
s_expression(expression(Expr)) --&gt; s_conjunction(Expr).
s_expression(expression(Expr1, OrOpTok, Expr2)) --&gt;
s_conjunction(Expr1),
[(OrOpTok, 'OR_OP')],
s_expression(Expr2).
 
% Conjunction -&gt; Equality _{_&amp;&amp; Equality _}_
s_conjunction(conjunction(Conj)) --&gt; s_equality(Conj).
s_conjunction(conjunction(Conj1, AndOpTok, Conj2)) --&gt;
s_equality(Conj1),
[(AndOpTok, 'AND_OP')],
s_conjunction(Conj2).
 
% Equality -&gt; Relation
s_equality(equality(Eq)) --&gt; s_relation(Eq).
s_equality(equality(Eq1, EqOp, Eq2)) --&gt;
s_relation(Eq1),
s_equ_op(EqOp),
s_relation(Eq2).
 
% EquOp -&gt; == | !=
s_equ_op(equ_op(EQOp)) --&gt;
[(EQOp, 'EQ_OP')] ;
[(EQOp, 'NE_OP')].
 
% Relation -&gt; Addition
s_relation(relation(Rel)) --&gt; s_addition(Rel).
s_relation(relation(Rel1, RO, Rel2)) --&gt; s_addition(Rel1), s_rel_op(RO), s_addition(Rel2).
 
% RelOp -&gt; &lt; | &lt;= | &gt; | &gt;=
s_rel_op(rel_op(ROp)) --&gt;
[(ROp, 'LT_OP')] ;
[(ROp, 'LE_OP')] ;
[(ROp, 'GT_OP')] ;
[(ROp, 'GE_OP')].
 
% Addition -&gt; Term _{_AddOp Term_}_
s_addition(addition(Add)) --&gt; s_term(Add).
s_addition(addition(Add1, AOp, Add2)) --&gt;
s_term(Add1),
s_add_op(AOp),
s_addition(Add2).
 
% AddOp -&gt; + | -
s_add_op(add_op(AOp)) --&gt;
[(AOp, 'ADD_OP')] ;
[(AOp, 'SUB_OP')].
 
% Term -&gt; Factor _{_MulOp Factor_}_
s_term(term(Term)) --&gt; s_factor(Term).
s_term(term(Term1, MOp, Term2)) --&gt;
s_factor(Term1),
s_mul_op(MOp),
s_term(Term2).
 
% MulOp -&gt; * | / | %
s_mul_op(mul_op(MOp)) --&gt;
[(MOp, 'MUL_OP')] ;
[(MOp, 'DIV_OP')] ;
[(MOp, 'MOD_OP')].
 
% Factor -&gt;  Primary
s_factor(factor(Factor)) --&gt; s_primary(Factor).
s_factor(factor(UOp, Factor)) --&gt; s_unary_op(UOp), s_primary(Factor).
 
% UnaryOp -&gt; - | !
s_unary_op(unary_op(UOp)) --&gt;
[(UOp, 'SUB_OP')] ;
[(UOp, 'NOT_OP')].
 
% Primary -&gt; Identifier &lt;[Expression]&gt; | Literal | ( Expression ) | Type ( Expression )
s_primary(primary(Prim)) --&gt; s_identifier(Prim).
 
s_primary(primary(Id, LBTok, Expr, RBTok)) --&gt;
s_identifier(Id) ;
[(LBTok, 'LBRACK')],
s_expression(Expr),
[(RBTok, 'RBRACK')].
 
s_primary(primary(Prim)) --&gt; s_literal(Prim).
 
s_primary(primary(LPTok, Expr, RPTok)) --&gt;
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].
 
s_primary(primary(Type, LPTok, Expr, RPTok)) --&gt;
s_type(type(Type)),
[(LPTok, 'LPAREN')],
s_expression(Expr),
[(RPTok, 'RPAREN')].
 
% Identifier -&gt; Letter _{_ Letter | Digit _}_
% Our lexer already handles this.
s_identifier(identifier(Ident)) --&gt; [(Ident, 'IDENT')].
 
% Literal -&gt; Integer | Boolean | Float | Char
% Our lexer handled these already.
s_literal(literal(Val)) --&gt;
s_intvalue(Val) ;
s_floatvalue(Val) ;
s_boolvalue(Val) ;
s_charvalue(Val).
 
s_intvalue(integer(Val)) --&gt; [(Val, 'INTEGER')].
s_boolvalue(boolean(Val)) --&gt; [(Val, 'BOOLVAL')].
s_floatvalue(float(Val)) --&gt; [(Val, 'FLOAT')].
s_charvalue(char(Val)) --&gt; [(Val, 'CHAR_LIT')].
 
%----- Pretty Printing Functions-----%
% These handle converting our parse tree, currently a messy Prolog ground term,
% into a nicely-formatted and indented rendition.
 
% Initial call handler.
print_parse_tree(Term_ParseTree, OutStream) :-
write_to_codes(Term_ParseTree, Codes_ParseTree),
print_indented(Codes_ParseTree, 0, OutStream).
 
% Main printing routine.  Clears punctuation and adjusts indentation,
%  then prints a word and recursively calls itself.
print_indented([],_,_).
print_indented(Codes, Level, OutStream) :-
handle_punctuation(Codes, Rest, Level, NewLevel, OutStream),
print_word(Rest, Tail, NewLevel, OutStream),
print_indented(Tail, NewLevel, OutStream), !.
 
% handle_punctuation handles adjusting indentation and clearing punctuation from the parse tree.
handle_punctuation([], [], _, _).
handle_punctuation([40|Codes], Rest, Level, NewLevel, OutStream) :-     % Case (
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, _, OutStream), !.
handle_punctuation([123|Codes], Rest, Level, FinalLevel, OutStream) :-  % Case {
NewLevel is Level + 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([41|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case )
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([125|Codes], Rest, Level, FinalLevel, OutStream) :-  % Case }
NewLevel is Level - 1,
handle_punctuation(Codes, Rest, NewLevel, FinalLevel, OutStream), !.
handle_punctuation([44|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case ,
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.
handle_punctuation([32|Codes], Rest, Level, FinalLevel, OutStream) :-   % Case ' '
handle_punctuation(Codes, Rest, Level, FinalLevel, OutStream), !.
 
handle_punctuation(Codes, Codes, Level, Level, _). % No punctuation found.
 
% print_word handles indentation, calls the word-fetcher, and outputs a newline.
print_word([], Rest, _, _) :- Rest = [].
print_word(Codes, Rest, Level, OutStream) :-
NumTabs is Level * 2,
tab(OutStream, NumTabs),
get_word(Codes, Rest, Level, OutStream),
put_char(OutStream, 'n'), !.
 
% get_word builds a string until the specified punctuation is found.
get_word([], _, _, _).                              % End of word
get_word([40|Codes], [40|Codes], _, _) :- !.        % (
get_word([41|Codes], [41|Codes], _, _) :- !.        % )
get_word([44|Codes], [44|Codes], _, _) :- !.        % ,
get_word([32|Codes], [32|Codes], _, _) :- !.        % ' '
get_word([123|Codes], [123|Codes], _, _) :- !.      % {
get_word([125|Codes], [125|Codes], _, _) :- !.      % }
get_word([Char|Codes], Rest, Level, OutStream) :-
put_code(OutStream, Char),
get_word(Codes, Rest, Level, OutStream).

Category: Computing | Tags: c, code, programming, prolog

About the Author: Justin

A 34 year old Software Engineer in Seattle, WA with a love for coding, music, video games, and the great outdoors.

Author Website: http://www.justinmangue.com

Leave a Reply Cancel reply