The grammar I need to create is based on the following:
Command lines start with a slash
Command lines can be continued with a hyphen as the last character
(excluding whitespaces) on a line
For some commands I want to parse their parameters
For other commands I am not interested in their parameters
This works almost fine with the following (simplified) Lexer
// Lexer for a line-oriented input: lines starting with '/' are commands,
// every other line is treated as one opaque data line.
lexer grammar T1Lexer;
// Default mode: line breaks between lines are dropped.
NewLine
: [\r\n]+ -> skip
;
// A leading '/' switches to CommandMode for the rest of the line.
CommandStart
: '/' -> pushMode(CommandMode)
;
// Any other first character starts a data line; `more` keeps that character
// so it becomes part of the DataLine token produced in DataMode.
DataStart
: . -> more, pushMode(DataMode)
;
mode DataMode;
// Everything up to (not including) the line break is one token; then return
// to the mode that pushed DataMode.
DataLine
: ~[\r\n]+ -> popMode
;
mode CommandMode;
// A line break ends the command and returns to the previous mode.
CmNL
: [\r\n]+ -> skip, popMode
;
// Continuation: '-' followed by optional blanks, the line break and the '/'
// of the next line. Sent to the hidden channel so the parser sees one command.
CONTINUEMINUS : ( '-' [ ]* ('\r/' | '\n/' | '\r\n/') ) -> channel(HIDDEN);
EOL: ( [ ]* ('\r' | '\n' | '\r\n') ) -> popMode;
SPACE : [ \t\r\n]+ -> channel(HIDDEN) ;
DOT : [.] ;
COMMA : ',' ;
CMD1 : 'CMD1';
CMD2 : 'CMD2';
// Parameters of CMDIGN are not tokenized: the rest of the line is read in
// DataMode as a single DataLine. DataLine stops at the first line break, so a
// trailing '-' continuation is consumed as data and not recognized here.
CMDIGN : 'CMDIGN' -> pushMode(DataMode) ;
VAR1 : 'VAR1=' ;
ID : ID_LITERAL;
// Upper-case letters, digits, '_' and '$', with at least one non-digit character.
fragment ID_LITERAL: [A-Z_$0-9]*?[A-Z_$]+?[A-Z_$0-9]*;
and Parser:
// Parser over the T1Lexer token stream: a file is a sequence of command lines
// and data lines.
parser grammar T1Parser;
options { tokenVocab=T1Lexer; }
root : line+ EOF ;
line: ( commandLine | dataLine)+ ;
// A non-command line, kept as one unparsed token.
dataLine : DataLine ;
// '/' followed by one of the known commands.
commandLine : CommandStart command ;
command : cmd1 | cmd2 | cmdign ;
// CMD1 and CMD2 take one or more VAR1=<ID> parameters.
cmd1 : CMD1 (VAR1 ID)+ ;
cmd2 : CMD2 (VAR1 ID)+ ;
// CMDIGN keeps its parameters as a single unparsed DataLine token.
cmdign : CMDIGN DataLine ;
The problem arises where I need a combination of 2. + 4., i.e. continuation for a command where I want to simply get the parms as an unparsed String (lines 5+6 in the example).
When I push to DataMode for CMDIGN on line 5 the continuation character is not recognized as it is swallowed by the "any until EOL" rule, so I pop back to default mode and the continuation line is considered a new command and fails to parse.
Is there a way of handling this combo properly ?
TIA - Alex
(For your example) You don't really need a CommandMode; it actually complicates things a bit.
T1Lexer.g4:
// Lexer without a separate command mode: commands are tokenized in the default
// mode and only the parameters of CMDIGN are read in DataMode.
lexer grammar T1Lexer
;
CMD_START: '/';
// Continuation: '-', optional blanks, a line break and the '/' that opens the
// next line. Hidden from the parser so a continued command looks like one line.
CONTINUE_EOL_SLASH: '-' EOL_F '/' -> channel(HIDDEN);
// A line end is a real token here; the parser uses it to separate lines.
EOL: EOL_F;
WS: [ \t]+ -> channel(HIDDEN);
DOT: [.];
COMMA: ',';
CMD1: 'CMD1';
CMD2: 'CMD2';
// After CMDIGN the rest of the (possibly continued) line is read in DataMode.
CMDIGN: 'CMDIGN' -> pushMode(DataMode);
VAR1: 'VAR1=';
ID: ID_LITERAL;
//=======================================
mode DataMode
;
// A line end in DataMode is reported as an ordinary EOL token and leaves the mode.
DM_EOL: EOL_F -> type(EOL), popMode;
// Zero or more segments that end in '-' plus a line end (continuations),
// followed by the remaining characters of the last line. The continuation
// line breaks and the following '/' therefore stay inside this one token.
DATA_LINE: ( ~[\r\n]*? '-' EOL_F)* ~[\r\n]+;
//=======================================
// Fragments shared by the rules above.
fragment NL: '\r'? '\n';
// Optional trailing blanks followed by a line break.
fragment EOL_F: [ ]* NL;
fragment ID_LITERAL: [A-Z_$0-9]*? [A-Z_$]+? [A-Z_$0-9]*;
T1Parser.g4
// Parser for the mode-light T1Lexer: lines are separated by explicit EOL tokens.
parser grammar T1Parser
;
options {
tokenVocab = T1Lexer;
}
// One or more lines separated by EOL, with an optional EOL before EOF.
root: line (EOL line)* EOL? EOF;
line: commandLine | dataLine | emptyLine;
// NOTE(review): DATA_LINE is only defined in DataMode, which the lexer enters
// only after CMDIGN - confirm whether a stand-alone dataLine can ever match.
dataLine: DATA_LINE;
commandLine: CMD_START command;
// A line consisting of just '/'.
emptyLine: CMD_START;
command: cmd1 | cmd2 | cmdign;
cmd1: CMD1 (VAR1 ID)+;
cmd2: CMD2 (VAR1 ID)+;
// The parameters are optional so that a bare "/ CMDIGN" line is accepted.
cmdign: CMDIGN DATA_LINE?;
Test Input:
/ CMD1 VAR1=VAL1 VAR1=VAL2
/ CMDIGN VAR1=BLAH VAR2=BLAH
/ CMD2 VAR1=VAL12 -
/ VAR1=VAL22
/ CMDIGN
/
/ CMDIGN VAR-1=0 -
/ VAR2=notignored
Token Stream:
[#0,0:0='/',<'/'>,1:0]
[#1,1:1=' ',<WS>,channel=1,1:1]
[#2,2:5='CMD1',<'CMD1'>,1:2]
[#3,6:6=' ',<WS>,channel=1,1:6]
[#4,7:11='VAR1=',<'VAR1='>,1:7]
[#5,12:15='VAL1',<ID>,1:12]
[#6,16:16=' ',<WS>,channel=1,1:16]
[#7,17:21='VAR1=',<'VAR1='>,1:17]
[#8,22:25='VAL2',<ID>,1:22]
[#9,26:26='\n',<EOL>,1:26]
[#10,27:27='/',<'/'>,2:0]
[#11,28:28=' ',<WS>,channel=1,2:1]
[#12,29:34='CMDIGN',<'CMDIGN'>,2:2]
[#13,35:54=' VAR1=BLAH VAR2=BLAH',<DATA_LINE>,2:8]
[#14,55:55='\n',<EOL>,2:28]
[#15,56:56='/',<'/'>,3:0]
[#16,57:57=' ',<WS>,channel=1,3:1]
[#17,58:61='CMD2',<'CMD2'>,3:2]
[#18,62:62=' ',<WS>,channel=1,3:6]
[#19,63:67='VAR1=',<'VAR1='>,3:7]
[#20,68:72='VAL12',<ID>,3:12]
[#21,73:73=' ',<WS>,channel=1,3:17]
[#22,74:76='-\n/',<CONTINUE_EOL_SLASH>,channel=1,3:18]
[#23,77:82=' ',<WS>,channel=1,4:1]
[#24,83:87='VAR1=',<'VAR1='>,4:7]
[#25,88:92='VAL22',<ID>,4:12]
[#26,93:93='\n',<EOL>,4:17]
[#27,94:94='/',<'/'>,5:0]
[#28,95:95=' ',<WS>,channel=1,5:1]
[#29,96:101='CMDIGN',<'CMDIGN'>,5:2]
[#30,102:102='\n',<EOL>,5:8]
[#31,103:103='/',<'/'>,6:0]
[#32,104:104='\n',<EOL>,6:1]
[#33,105:105='/',<'/'>,7:0]
[#34,106:106=' ',<WS>,channel=1,7:1]
[#35,107:112='CMDIGN',<'CMDIGN'>,7:2]
[#36,113:150=' VAR-1=0 - \n/
tree output:
(root
(line
(commandLine
/
(command
(cmd1 CMD1 VAR1= VAL1 VAR1= VAL2)
)
)
)
\n
(line
(commandLine
/
(command
(cmdign CMDIGN VAR1=BLAH VAR2=BLAH)
)
)
)
\n
(line
(commandLine
/
(command
(cmd2 CMD2 VAR1= VAL12 VAR1= VAL22)
)
)
)
\n
(line
(commandLine
/
(command
(cmdign CMDIGN)
)
)
)
\n
(line
(emptyLine /)
)
\n
(line
(commandLine
/
(command
(cmdign CMDIGN VAR-1=0 - \n/ VAR2=notignored)
)
)
)
<EOF>
)
I'm reading John R. Levine's compiler book, lex & yacc.
I followed the first example in Chapter 3, which is as follows.
/*parser.y*/
/* Token kinds delivered by the scanner. */
%token NAME NUMBER
%%
/* A statement is either an assignment (no action attached here) or a bare
   expression, whose value is printed. */
statement: NAME '=' expression
| expression { printf("= %d\n", $1); }
;
/* Left-recursive chain of NUMBERs joined by '+' or '-'; $$ carries the
   running result. */
expression: expression '+' NUMBER { $$ = $1 + $3; }
| expression '-' NUMBER { $$ = $1 - $3; }
| NUMBER { $$ = $1; }
;
/*parser.l*/
%{
/*
 * Scanner for the grammar in parser.y.
 * - A run of digits is converted with atoi() and returned as NUMBER, with the
 *   value passed to the parser through yylval.
 * - Blanks and tabs are discarded.
 * - A newline returns 0, which ends the parse.
 * - Any other character is reported through yyerror().
 * NOTE(review): no rule returns '+', '-', '=' or NAME to the parser; those
 * characters fall through to the catch-all rule - confirm this is intended.
 */
#include "y.tab.h"
extern int yylval;
%}
%%
[0-9]+ { yylval = atoi(yytext); return NUMBER; }
[ \t] ; /* ignore whitespace */
\n return 0; /* logical EOF */
. {yyerror("error!! %c\n",*yytext);}
%%
And when I ran these two files on the terminal, the following error came out while compiling the two files.
$ yacc -d parser.y
$ lex parser.l
$ cc -o parser y.tab.c lex.yy.c -ly -ll
/usr/bin/ld: cannot find -ly
collect2: error: ld returned 1 exit status
What should I do?
Please help me.
#include<iostream>
#include<string>
using namespace std;
// Contrasts `char + "literal"` with appending to a std::string piece by piece.
int main()
{
string name[] = {"Geeks ", "for", "Geeks"} ;
string v,s ;
// name[0][0] is the char 'G' and "." is a string literal, which decays to a
// const char*. The `+` is therefore pointer arithmetic: the pointer to "." is
// advanced by the integer value of 'G', far past the two-byte literal, and
// that out-of-range pointer is then appended to v (undefined behaviour).
v += name[0][0] + "." ;
cout << v << "\n" ;
// Each += here is std::string::operator+=, first with a char and then with a
// C string, so s ends up as "G.".
s += name[0][0] ;
s += "." ;
cout << s << "\n" ;
}
On outputting:
string v outputs a garbage string while string s outputs G.
Output :
# // output of string v
G. // output of string s
Why does v give out a garbage value even though I'm appending '.' to the new string ?
Kindly explain both the cases.
Clang will complain about:
v += name[0][0] + "." ;
with a message like:
warning: adding 'std::...' (aka 'char') to a string does not append to the string [-Wstring-plus-int]
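In other words, char + "chars" does not concatenate: the string literal decays to a const char*, and the char is added to that pointer as an integer offset (pointer arithmetic), so the result points outside the literal and reading it is undefined behaviour. To concatenate, make one of the operands a std::string: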
v += name[0][0] + string(".") ;
My grammar (as follows (trimmed down from the original)) requires somewhat overlapping rules
// First attempt: the number formats are distinguished in the lexer.
grammar NOVIANum;
statement : (priorityStatement | integerStatement)* ;
priorityStatement : T_PRIO TwoDigits ;
integerStatement : T_INTEGER Integer ;
WS : [ \t\r\n]+ -> skip ;
T_PRIO : 'PRIO' ;
T_INTEGER : 'INTEGER' ;
// Integer and TwoDigits both match inputs such as "10" with the same length.
// The lexer decides without looking at the parser context; for equal-length
// matches the rule listed first wins, so such input always becomes Integer
// here and the parser never sees a TwoDigits token for it.
Integer: OneToNine Digit* | ZERO ;
TwoDigits : Digit Digit ;
fragment OneToNine : ('1'..'9') ;
fragment Digit: ('0'..'9');
ZERO : [0] ;
so "Integer" and "TwoDigits" overlap to a certain extent.
The following input
INTEGER 10
PRIO 10
results in
line 2:5 mismatched input '10' expecting TwoDigits
when Integer precedes TwoDigits and in
line 1:8 mismatched input '10' expecting Integer
when TwoDigits precedes Integer in the grammar.
Is there a way around this ?
Thanks - Alex
Edit:
Thanks @GRosenberg, your suggestion, of course, worked for this small example, but when I integrated this into my full grammar it led to different mismatched input errors sure enough.
The reason being another lexer rule which requires a range of '[1-4]', so I thought I'll be clever and turn it into
// Second attempt: numbers are assembled in the parser from single-digit tokens.
grammar NOVIANum;
statement : (priorityT | integerT | levelT )* ;
priorityT : T_PRIO twoDigits ;
integerT : T_INTEGER integer ;
levelT : T_LEVEL levelNumber ;
// Intended range 01-09 and 10-49. The second alternative needs an OneToFour
// token, which the lexer below never emits (see DIGIT).
levelNumber : ( ZERO DIGIT ) | ( OneToFour (ZERO | DIGIT) ) ;
integer: ZERO* ( DIGIT ( DIGIT | ZERO )* ) ;
twoDigits : (ZERO | DIGIT) ( ZERO | DIGIT ) ;
// Not referenced by any other rule in this grammar.
oneToFour : OneToFour (DIGIT | ZERO) ;
WS : [ \t\r\n]+ -> skip ;
T_INTEGER : 'INTEGER' ;
T_LEVEL : 'LEVEL' ;
T_PRIO : 'PRIO' ;
// DIGIT is listed before OneToFour and FiveToNine and matches the same single
// characters, so every 1-9 is tokenized as DIGIT and the two rules below never
// produce tokens of their own.
DIGIT: OneToFour | FiveToNine ;
ZERO : '0' ;
OneToFour : [1-4] ;
FiveToNine : [5-9] ;
This still works for the previous inputs but ...
INTEGER 350
PRIO 10
LEVEL 01
LEVEL 05
LEVEL 10
LEVEL 49
results in
[#0,0:6='INTEGER',<2>,1:0]
[#1,8:8='3',<5>,1:8]
[#2,9:9='5',<5>,1:9]
[#3,10:10='0',<6>,1:10]
[#4,12:15='PRIO',<4>,2:0]
[#5,17:17='1',<5>,2:5]
[#6,18:18='0',<6>,2:6]
[#7,20:24='LEVEL',<3>,3:0]
[#8,26:26='0',<6>,3:6]
[#9,27:27='1',<5>,3:7]
[#10,29:33='LEVEL',<3>,4:0]
[#11,35:35='0',<6>,4:6]
[#12,36:36='5',<5>,4:7]
[#13,38:42='LEVEL',<3>,5:0]
[#14,44:44='1',<5>,5:6]
[#15,45:45='0',<6>,5:7]
[#16,47:51='LEVEL',<3>,6:0]
[#17,53:53='4',<5>,6:6]
[#18,54:54='9',<5>,6:7]
[#19,55:54='<EOF>',<-1>,6:8]
line 5:6 no viable alternative at input '1'
line 6:6 no viable alternative at input '4'
(statement (integerT INTEGER (integer 3 5 0)) (priorityT PRIO (twoDigits 1 0)) (levelT LEVEL (levelNumber 0 1)) (levelT LEVEL (levelNumber 0 5)) (levelT LEVEL (levelNumber 1 0)) (levelT LEVEL (levelNumber 4 9)))
What am I missing here ?
Edit 2:
Ok, answering my own question here, of course
DIGIT: OneToFour | FiveToNine ;
kicks in where it shouldn't, even in this combined form,
so about the only way to get around this - I can think of - would be
// Third attempt: the combined DIGIT token is removed, so the lexer emits
// ZERO, OneToFour and FiveToNine, and the parser rules spell out the union.
grammar NOVIANum;
statement : (priorityT | integerT | levelT )* ;
priorityT : T_PRIO twoDigits ;
integerT : T_INTEGER integer ;
levelT : T_LEVEL levelNumber ;
// 01-09, or 10-49.
levelNumber : ( ZERO (OneToFour | FiveToNine) | ( OneToFour (ZERO | (OneToFour | FiveToNine)) ) ) ;
// Optional leading zeros, then a non-zero digit followed by any digits.
integer: ZERO* ( (OneToFour | FiveToNine) ( (OneToFour | FiveToNine) | ZERO )* ) ;
// Exactly two digits, each 0-9.
twoDigits : (ZERO | (OneToFour | FiveToNine)) ( ZERO | (OneToFour | FiveToNine) ) ;
WS : [ \t\r\n]+ -> skip ;
T_INTEGER : 'INTEGER' ;
T_LEVEL : 'LEVEL' ;
T_PRIO : 'PRIO' ;
// DIGIT: OneToFour | FiveToNine;
// The three digit tokens cover disjoint character sets, so they never compete.
ZERO : '0' ;
OneToFour : [1-4] ;
FiveToNine : [5-9] ;
because when I create a parser rule for it like
oneToNine : OneToFour | FiveToNine ;
it'll give me this
integerT INTEGER (integer (oneToNine 3) (oneToNine 5) 0))
which is ugly and harder to handle than just
(integerT INTEGER (integer 3 5 0))
As a general issue of design, always try to work with distinguishing elements and their objects (T_PRIO -> TwoDigits) at the same level, parser or lexer. Presuming the semantic nature of the Integer and TwoDigits rules is important, promote them to the parser and let the lexer only produce digits. That is, don't over-constrain the lexer.
In the parser, you can let the integer rule functionally hide the twoDigits rule except in the evaluation of the priorityStatement rule:
// Parser rules: numbers are assembled from single-digit tokens, so the lexer
// never has to choose between overlapping number formats.
priorityStatement : T_PRIO twoDigits ;
integerStatement : T_INTEGER integer ;
// Either a lone 0, or a non-zero digit followed by any digits (no leading zero).
integer: ZERO | ( DIGIT ( DIGIT | ZERO )* ) ;
// Exactly two digits, each 0-9. '0' is tokenized as ZERO, not DIGIT, so both
// token types must be accepted in each position; otherwise "PRIO 10" is rejected.
twoDigits : ( DIGIT | ZERO ) ( DIGIT | ZERO ) ;
// Lexer rules: keywords plus two disjoint single-digit tokens.
T_PRIO : 'PRIO' ;
T_INTEGER : 'INTEGER' ;
DIGIT : [1-9] ;
ZERO : '0' ;
I have d1="11" and d2="07". I want to convert d1 and d2 to integers and perform d1-d2. How do I do this in UNIX?
d1 - d2 currently returns "11-07" as result for me.
The standard solution:
expr $d1 - $d2
You can also do:
echo $(( d1 - d2 ))
but beware that this will treat 07 as an octal number! (so 07 is the same as 7, but 010 is different than 10).
Any of these will work from the shell command line. bc is probably your most straightforward solution though.
Using bc:
$ echo "$d1 - $d2" | bc
Using awk:
$ echo $d1 $d2 | awk '{print $1 - $2}'
Using perl:
$ perl -E "say $d1 - $d2"
Using Python:
$ python -c "print $d1 - $d2"
all return
4
An answer that is not limited to the OP's case
The title of the question leads people here, so I decided to answer that question for everyone else since the OP's described case was so limited.
TL;DR
I finally settled on writing a function.
If you want 0 in case of non-int:
int(){ printf '%d' ${1:-} 2>/dev/null || :; }
If you want [empty_string] in case of non-int:
int(){ expr 0 + ${1:-} 2>/dev/null||:; }
If you want to find the first int or [empty_string]:
int(){ expr ${1:-} : '[^0-9]*\([0-9]*\)' 2>/dev/null||:; }
If you want to find the first int or 0:
# This is a combination of numbers 1 and 3: expr extracts the first run of
# digits (nothing if there is none), and printf '%d' prints that number, or 0
# when it is given no argument.
int(){ printf '%d' $(expr ${1:-} : '[^0-9]*\([0-9]*\)' 2>/dev/null)||:; }
If you want to get a non-zero status code on non-int, remove the ||: (aka or true) but leave the ;
Tests
# Wrapped in parens to call a subprocess and not `set` options in the main bash process
# In other words, you can literally copy-paste this code block into your shell to test
( set -eu;
tests=( 4 "5" "6foo" "bar7" "foo8.9bar" "baz" " " "" )
# Show the current definition of int, then its output for every entry of the
# tests array and for a call without arguments.
test(){ echo; type int; for test in "${tests[@]}"; do echo "got '$(int $test)' from '$test'"; done; echo "got '$(int)' with no argument"; }
int(){ printf '%d' ${1:-} 2>/dev/null||:; };
test
int(){ expr 0 + ${1:-} 2>/dev/null||:; }
test
int(){ expr ${1:-} : '[^0-9]*\([0-9]*\)' 2>/dev/null||:; }
test
int(){ printf '%d' $(expr ${1:-} : '[^0-9]*\([0-9]*\)' 2>/dev/null)||:; }
test
# unexpected inconsistent results from `bc`
int(){ bc<<<"${1:-}" 2>/dev/null||:; }
test
)
Test output
int is a function
int ()
{
printf '%d' ${1:-} 2> /dev/null || :
}
got '4' from '4'
got '5' from '5'
got '0' from '6foo'
got '0' from 'bar7'
got '0' from 'foo8.9bar'
got '0' from 'baz'
got '0' from ' '
got '0' from ''
got '0' with no argument
int is a function
int ()
{
expr 0 + ${1:-} 2> /dev/null || :
}
got '4' from '4'
got '5' from '5'
got '' from '6foo'
got '' from 'bar7'
got '' from 'foo8.9bar'
got '' from 'baz'
got '' from ' '
got '' from ''
got '' with no argument
int is a function
int ()
{
expr ${1:-} : '[^0-9]*\([0-9]*\)' 2> /dev/null || :
}
got '4' from '4'
got '5' from '5'
got '6' from '6foo'
got '7' from 'bar7'
got '8' from 'foo8.9bar'
got '' from 'baz'
got '' from ' '
got '' from ''
got '' with no argument
int is a function
int ()
{
printf '%d' $(expr ${1:-} : '[^0-9]*\([0-9]*\)' 2>/dev/null) || :
}
got '4' from '4'
got '5' from '5'
got '6' from '6foo'
got '7' from 'bar7'
got '8' from 'foo8.9bar'
got '0' from 'baz'
got '0' from ' '
got '0' from ''
got '0' with no argument
int is a function
int ()
{
bc <<< "${1:-}" 2> /dev/null || :
}
got '4' from '4'
got '5' from '5'
got '' from '6foo'
got '0' from 'bar7'
got '' from 'foo8.9bar'
got '0' from 'baz'
got '' from ' '
got '' from ''
got '' with no argument
Note
I got sent down this rabbit hole because the accepted answer is not compatible with set -o nounset (aka set -u)
# This works
$ ( number="3"; string="foo"; echo $((number)) $((string)); )
3 0
# This doesn't
$ ( set -u; number="3"; string="foo"; echo $((number)) $((string)); )
-bash: foo: unbound variable
let d=d1-d2;echo $d;
This should help.
Use this:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Convert the decimal string d1 to an int with atoi() and print the result. */
int main()
{
    const char *d1 = "11";
    int d1int = atoi(d1);

    /* Print the converted integer; passing the char pointer d1 to %d is
       undefined behaviour and does not print 11. */
    printf("d1 = %d\n", d1int);
    return 0;
}
etc.