Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactored RLang language module to new framework #1864

Merged
merged 4 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions languages/rlang/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
<groupId>org.antlr</groupId>
<artifactId>antlr4-runtime</artifactId>
</dependency>
<dependency>
<groupId>de.jplag</groupId>
<artifactId>language-antlr-utils</artifactId>
<version>${revision}</version>
</dependency>
</dependencies>

<build>
Expand Down
304 changes: 177 additions & 127 deletions languages/rlang/src/main/antlr4/de/jplag/rlang/grammar/R.g4
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
[The "BSD licence"]
Copyright (c) 2013 Terence Parr
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
Expand All @@ -12,6 +13,7 @@
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
Expand All @@ -27,28 +29,27 @@
/**
derived from http://svn.r-project.org/R/trunk/src/main/gram.y
http://cran.r-project.org/doc/manuals/R-lang.html#Parser

I'm no R genius but this seems to work.

Requires RFilter.g4 to strip away NL that are really whitespace,
not end-of-command. See TestR.java

Usage:

$ antlr4 R.g4 RFilter.g4
$ javac *.java
$ java TestR sample.R
... prints parse tree ...
*/

/*
Modified version of the original in https://github.com/antlr/grammars-v4/blob/master/r/R.g4 so that I can separate the most relevant tokens of R in
the JplagRListener.java file.
Author of the modification: Antonio Javier Rodriguez Perez
*/
// $antlr-format alignTrailingComments true, columnLimit 150, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine false, allowShortBlocksOnASingleLine true, alignSemicolons hanging, alignColons hanging

grammar R;

prog: ( expr (';'|NL)
| NL
)*
EOF
// Entry rule: a complete R source file. The input is any sequence of
// expressions and runs of statement separators (semicolons or newlines),
// terminated by end of input.
prog
: ((SEMICOLON | NL)+ | expr )* EOF
;

/*
Expand All @@ -58,159 +59,208 @@ expr_or_assign
;
*/

expr: expr index_statement // '[[' follows R's yacc grammar
| expr access_package expr
| expr ('$'|'@') expr
| <assoc=right> expr '^' expr
| ('-'|'+') expr
| expr ':' expr
| expr USER_OP expr // anything wrappedin %: '%' .* '%'
| expr ('*'|'/') expr
| expr ('+'|'-') expr
| expr ('>'|'>='|'<'|'<='|'=='|'!=') expr
| '!' expr
| expr ('&'|'&&') expr
| expr ('|'|'||') expr
| '~' expr
| expr '~' expr
| expr assign_value expr
| function_definition // define function
| expr function_call // call function
| compound_statement
| if_statement
| for_statement
| while_statement
| repeat_statement
| help
| next_statement
| break_statement
| '(' expr ')'
| ID
| constant
// Single left-recursive rule covering every R expression and statement form.
// Each alternative carries a #Label, so ANTLR generates a dedicated context
// class and enter/exit listener callbacks per alternative.
// For the operator alternatives the order matters: in an ANTLR4 left-recursive
// rule, alternatives listed earlier bind more tightly than later ones.
expr
: expr LIST_ACCESS_START sublist LIST_ACCESS_END #ListAccess // '[[' follows R's yacc grammar
| expr ARRAY_ACCESS_START sublist ARRAY_ACCESS_END #ArrayAccess
| expr NAMESPACE_ACCESS expr #NamespaceAccess
| expr COMPONENT_ACCESS expr #ComponentAccess
| <assoc = right> expr '^' expr #Exponent
| ADD_SUB expr #Sign
| expr RANGE_OPERATOR expr #Range
| expr USER_OP expr #UserDefinedOperation // anything wrapped in %: '%' .* '%'
| expr MULT_DIV expr #MultOrDiv
| expr ADD_SUB expr #AddOrSub
| expr COMPARATOR expr #Comparison
| NOT expr #Not
| expr AND expr #And
| expr OR expr #Or
| '~' expr #ModelFormulaePrefix
| expr '~' expr #ModelFormulaeInfix
| expr (ASSIGN | EQUALS) expr #Assignment
| FUNCTION PAREN_L formlist? PAREN_R expr #FunctionDefinition // define function
| expr PAREN_L sublist PAREN_R #FunctionCall // call function
| CURLY_L exprlist CURLY_R #CompoundStatement // compound statement
| IF PAREN_L expr PAREN_R expr #If
// newlines are tolerated between the 'if' body and the 'else' keyword
| IF PAREN_L expr PAREN_R expr NL* ELSE expr #IfElse
| FOR PAREN_L ID IN expr PAREN_R expr #For
| WHILE PAREN_L expr PAREN_R expr #While
| REPEAT expr #Repeat
| HELP expr #Help // get help on expr, usually string or ID
| NEXT #Next
| BREAK #Break
| PAREN_L expr PAREN_R #BracketTerm
| ID #Id
| STRING #String
| HEX #Hex
| INT #Int
| FLOAT #Float
| COMPLEX #Complex
| NULL #Null
| NA #Na
| INF #Inf
| NAN #Nan
| TRUE #True
| FALSE #False
// leading newlines are absorbed into the expression that follows them
| NL+ expr #Newline
;

index_statement : '[[' sublist ']' ']' | '[' sublist ']' ;

access_package: '::'|':::' ;

function_definition: 'function' '(' formlist? ')' expr ;

function_call : '(' sublist ')' ;

constant: constant_number | constant_string | constant_bool | 'NULL' | 'NA' | 'Inf' | 'NaN' ;

constant_number: HEX | INT | FLOAT | COMPLEX ;

constant_string: STRING ;

constant_bool: 'TRUE' | 'FALSE' ;

help: '?' expr ; // get help on expr, usually string or ID

if_statement : 'if' '(' expr ')' expr | 'if' '(' expr ')' expr 'else' expr ;

for_statement : 'for' '(' ID 'in' expr ')' expr ;

while_statement : 'while' '(' expr ')' expr ;

repeat_statement: 'repeat' expr ;

next_statement: 'next' ;

break_statement: 'break' ;

compound_statement: '{' exprlist '}' ;

exprlist
: expr ((';'|NL) expr?)*
|
: expr ((SEMICOLON | NL) expr?)*
;

formlist : form (',' form)* ;
formlist
: form (',' form)*
;

form: ID
| assign_func_declaration
// One formal parameter in a function definition: a bare name, a name with a
// default value (ID '=' expr), the '...' placeholder, or a single '.'.
form
: ID
| ID EQUALS expr
| '...'
| '.'
;

sublist : sub (',' sub)* ;
sublist
: sub (',' sub)*
;

sub : expr
| assign_value_list
// One argument inside a call or index list: a plain expression, a named
// argument keyed by ID, STRING or NULL (with or without a value after '='),
// '...', or '.'.
sub
: expr
| ID EQUALS
| ID EQUALS expr
| STRING EQUALS
| STRING EQUALS expr
| NULL EQUALS
| NULL EQUALS expr
| '...'
| '.'
// the final alternative is intentionally empty, so an argument slot may be left blank
|
;

assign_value: '<-'|'<<-'|'='|'->'|'->>'|':=';

assign_func_declaration: ID '=' expr | '...' ;

assign_value_list: ID '=' | ID '=' expr | constant_string '=' | constant_string '=' expr | 'NULL' '=' | 'NULL' '=' expr | '...' ;



HEX : '0' ('x'|'X') HEXDIGIT+ [Ll]? ;
IF: 'if';
FOR: 'for';
WHILE: 'while';
REPEAT: 'repeat';
FUNCTION: 'function';
ELSE: 'else';
IN: 'in';

// Indexing, namespace and component access operators.
// NOTE(review): the lexer always prefers the longest match, so "]]" is always
// emitted as a single LIST_ACCESS_END token. Nested single-bracket indexing
// such as a[b[1]] presumably no longer parses with these tokens — confirm
// with a test case.
LIST_ACCESS_START: '[[';
LIST_ACCESS_END: ']]';
ARRAY_ACCESS_START: '[';
ARRAY_ACCESS_END: ']';
NAMESPACE_ACCESS: ':::' | '::';
COMPONENT_ACCESS: '$' | '@';

HELP: '?';
NEXT: 'next';
BREAK: 'break';

NULL: 'NULL';
NA: 'NA';
// R spells the infinity constant with a capital I ("Inf"); lower-case "inf"
// is an ordinary identifier and must keep lexing as ID.
INF: 'Inf';
NAN: 'NaN';
TRUE: 'TRUE';
FALSE: 'FALSE';

NOT: '!';
RANGE_OPERATOR: ':';

MULT_DIV: '*' | '/';
ADD_SUB: '+' | '-';
COMPARATOR: '>' | '>=' | '<' | '<=' | '==' | '!=';
ASSIGN: '<-' | '<<-' | '->' | '->>' | ':=';
EQUALS: '=';
AND: '&&' | '&';
OR: '||' | '|';

PAREN_L: '(';
PAREN_R: ')';
CURLY_L: '{';
CURLY_R: '}';

HEX
: '0' ('x' | 'X') HEXDIGIT+ [Ll]?
;

INT : DIGIT+ [Ll]? ;
INT
: DIGIT+ [Ll]?
;

fragment
HEXDIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
fragment HEXDIGIT
: ('0' ..'9' | 'a' ..'f' | 'A' ..'F')
;

FLOAT: DIGIT+ '.' DIGIT* EXP? [Ll]?
| DIGIT+ EXP? [Ll]?
| '.' DIGIT+ EXP? [Ll]?
FLOAT
: DIGIT+ '.' DIGIT* EXP? [Ll]?
| DIGIT+ EXP? [Ll]?
| '.' DIGIT+ EXP? [Ll]?
;

fragment
DIGIT: '0'..'9' ;
fragment DIGIT
: '0' ..'9'
;

fragment
EXP : ('E' | 'e') ('+' | '-')? INT ;
fragment EXP
: ('E' | 'e') ('+' | '-')? INT
;

COMPLEX
: INT 'i'
| FLOAT 'i'
: INT 'i'
| FLOAT 'i'
;

STRING
: '"' ( ESC | ~[\\"] )*? '"'
| '\'' ( ESC | ~[\\'] )*? '\''
| '`' ( ESC | ~[\\'] )*? '`'
: '"' (ESC | ~[\\"])*? '"'
| '\'' ( ESC | ~[\\'])*? '\''
// Backtick-quoted names end at a backtick, so that is the character to
// exclude from the body; a plain single quote is legal inside backticks.
| '`' ( ESC | ~[\\`])*? '`'
;
fragment
ESC : '\\' [abtnfrv"'\\]
| UNICODE_ESCAPE
| HEX_ESCAPE
| OCTAL_ESCAPE

fragment ESC
: '\\' [abtnfrv"'\\]
| UNICODE_ESCAPE
| HEX_ESCAPE
| OCTAL_ESCAPE
;

fragment
UNICODE_ESCAPE
: '\\' 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
| '\\' 'u' '{' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT '}'
fragment UNICODE_ESCAPE
: '\\' 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
| '\\' 'u' '{' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT '}'
;

fragment
OCTAL_ESCAPE
: '\\' [0-3] [0-7] [0-7]
| '\\' [0-7] [0-7]
| '\\' [0-7]
fragment OCTAL_ESCAPE
: '\\' [0-3] [0-7] [0-7]
| '\\' [0-7] [0-7]
| '\\' [0-7]
;

fragment
HEX_ESCAPE
: '\\' HEXDIGIT HEXDIGIT?
fragment HEX_ESCAPE
: '\\' HEXDIGIT HEXDIGIT?
;

ID : '.' (LETTER|'_'|'.') (LETTER|DIGIT|'_'|'.')*
| LETTER (LETTER|DIGIT|'_'|'.')*
ID
: '.' (LETTER | '_' | '.') (LETTER | DIGIT | '_' | '.')*
| LETTER (LETTER | DIGIT | '_' | '.')*
;

fragment LETTER : [a-zA-Z] ;

USER_OP : '%' .*? '%' ;
fragment LETTER
: [a-zA-Z]
;

COMMENT : '#' .*? '\r'? '\n' -> type(NL) ;
USER_OP
: '%' .*? '%'
;

// A '#' comment runs to the end of its line and is re-typed as NL, so the
// parser sees it as a line break. EOF is accepted as a terminator too, so a
// comment on the last line of a file without a trailing newline still lexes.
COMMENT
: '#' .*? ('\r'? '\n' | EOF) -> type(NL)
;

// Match both UNIX and Windows newlines
NL : '\r'? '\n' ;
NL
: '\r'? '\n'
;

SEMICOLON: ';';

WS : [ \t\u000C]+ -> skip ;
WS
: [ \t\u000C]+ -> skip
;
Loading
Loading