Modify/rewrite ErrorToken String in scala - scala

I'm new in scala. I have a problem in string literal. Here's my code:
import scala.util.matching.Regex
import scala.util.parsing.combinator.lexical.StdLexical
import scala.util.parsing.combinator.token.StdTokens
import scala.util.parsing.input.CharArrayReader.EofCh
trait SimpleTokens extends StdTokens {
// Adapted from StdTokens
case class FloatLit(chars: String) extends Token {
override def toString = "FloatLit "+chars
}
case class IntLit(chars: String) extends Token {
override def toString = "IntLit "+chars
}
case class BooleanLit(chars: String) extends Token {
override def toString = "BooleanLit " + chars
}
case class StrLit(chars: String) extends Token {
override def toString = "\"" + chars.slice(1,chars.length-1) + "\""
}
}
class SimpleLexer extends StdLexical with SimpleTokens {
import scala.util.parsing.input.CharArrayReader.EofCh
reserved ++= List( "mod", "div", "array","if","then")
delimiters ++= List( ";", "(", ")", "+", "-", "*", "/",".")
def regex(r: Regex): Parser[String] = new Parser[String] {
def apply(in: Input) = {
val source = in.source
val offset = in.offset
(r findPrefixMatchOf (source.subSequence(offset, source.length))) match {
case Some(matched) =>
Success(source.subSequence(offset, offset + matched.end).toString,
in.drop(matched.end))
case None =>
Failure("string matching regex `" + r + "' expected but `" + in.first + "' found", in.drop(0))
}
}
}
override def token: Parser[Token] = {
// Adapted from StdLexical
(
regex("true|false".r) ^^ { BooleanLit(_)}
|regex("[a-z][a-z]*".r) ^^ { processIdent(_) }
|regex("""([0-9]*)(((\.)?[0-9]+(e|E)(\+|-)?[0-9]+)|(\.[0-9]+))""".r) ^^ { FloatLit(_) }
|regex("\\d+".r) ^^ { IntLit(_) }
|regex("""'([^'\"]|'')*'""".r) ^^ {StrLit(_) }
|EofCh ^^^ EOF
|delim
)
}
override def whitespace: Parser[Any] = rep(
whitespaceChar
| '/' ~ '*' ~ comment
| '/' ~ '*' ~> failure("unclosed comment"))
override protected def comment: Parser[Any] = (
'*' ~ '/' ^^ { case _ => ' ' }
| chrExcept(EofCh) ~ comment)
}
Input is: 'this is string
Output should be: ErrorToken(Unclosed string: 'This is string)
But when I run, I receive this:
ErrorToken '
identifier this
identifier is
identifier string
What do I have to do to get the correct Output? Please help me!!! Thanks for considering my problem

You can modify on Regex:
|regex("""(')[^']*(\t)""".r) ^^ ("Illegal tab in string:"+_) ^^ ErrorToken
|regex("""(')[^']*""".r) ^^ ("Unclosed string:"+_) ^^ ErrorToken
PPL-2012?

Related

Matching braces code doesn't terminate

I have this code below to check a string. We want to verify that it starts with '{' and ends with '}' and that it contains sequences of non-"{}" characters and strings that also have this property.
import util.parsing.combinator._
class Comp extends RegexParsers with PackratParsers {
lazy val bracefree: PackratParser[String] = """[^{}]*""".r ^^ {
case a => a
}
lazy val matching: PackratParser[String] = (
"{" ~ rep(bracefree | matching) ~ "}") ^^ {
case a ~ b ~ c => a + b.mkString("") + c
}
}
object Brackets extends Comp {
def main(args: Array[String])= {
println(parseAll(matching, "{ foo {hello 3 } {}}").get)
}
}
The desired output for this is to echo { foo {hello 3 } {}}, but it ends up taking a long time before dying from java.lang.OutOfMemoryError: GC overhead limit exceeded. What am I doing wrong and what should I have done instead?
Your regular expression for bracefree string matches even an empty string, so parser produced by rep() succeeds without consuming any input and will loop endlessly.
Use a + quantifier instead of *:
lazy val bracefree: PackratParser[String] = """[^{}]+""".r ^^ {
case a => a
}
Also, by default RegexParsers will skip empty strings and whitespaces. To turn that behavior off, just override method skipWhitespace to always return false. In the end your parser will look like this:
import util.parsing.combinator._
class Comp extends RegexParsers with PackratParsers {
override def skipWhitespace = false
lazy val bracefree: PackratParser[String] = """[^{}]+""".r ^^ {
case a => a
}
lazy val matching: PackratParser[String] = (
"{" ~ rep(bracefree | matching) ~ "}") ^^ {
case a ~ b ~ c => a + b.mkString("") + c
}
}
object Brackets extends Comp {
def main(args: Array[String])= {
println(parseAll(matching, "{ foo {hello 3 } {}}").get)
// prints: { foo {hello 3 } {}}
}
}

scala parser combinator infinite loop

I'm trying to write a simple parser in scala but when I add a repeated token Scala seems to get stuck in an infinite loop.
I have 2 parse methods below. One uses rep(). The non repetitive version works as expected (not what I want though) using the rep() version results in an infinite loop.
EDIT:
This was a learning example where I tired to enforce the '=' was surrounded by whitespace.
If it is helpful this is my actual test file:
a = 1
b = 2
c = 1 2 3
I was able to parse: (with the parse1 method)
K = V
but then ran into this problem when tried to expand the exercise out to:
K = V1 V2 V3
import scala.util.parsing.combinator._
import scala.io.Source.fromFile
class MyParser extends RegexParsers {
override def skipWhitespace(): Boolean = { false }
def key: Parser[String] = """[a-zA-Z]+""".r ^^ { _.toString }
def eq: Parser[String] = """\s+=\s+""".r ^^ { _.toString.trim }
def string: Parser[String] = """[^ \t\n]*""".r ^^ { _.toString.trim }
def value: Parser[List[String]] = rep(string)
def foo(key: String, value: String): Boolean = {
println(key + " = " + value)
true
}
def parse1: Parser[Boolean] = key ~ eq ~ string ^^ { case k ~ eq ~ string => foo(k, string) }
def parse2: Parser[Boolean] = key ~ eq ~ value ^^ { case k ~ eq ~ value => foo(k, value.toString) }
def parseLine(line: String): Boolean = {
parse(parse2, line) match {
case Success(matched, _) => true
case Failure(msg, _) => false
case Error(msg, _) => false
}
}
}
object TestParser {
def usage() = {
System.out.println("<file>")
}
def main(args: Array[String]) : Unit = {
if (args.length != 1) {
usage()
} else {
val mp = new MyParser()
fromFile(args(0)).getLines().foreach { mp.parseLine }
println("done")
}
}
}
Next time, please provide some concrete examples, it's not obvious what your input is supposed to look like.
Meanwhile, you can try this, maybe you find it helpful:
import scala.util.parsing.combinator._
import scala.io.Source.fromFile
class MyParser extends JavaTokenParsers {
// override def skipWhitespace(): Boolean = { false }
def key: Parser[String] = """[a-zA-Z]+""".r ^^ { _.toString }
def eq: Parser[String] = "="
def string: Parser[String] = """[^ \t\n]+""".r
def value: Parser[List[String]] = rep(string)
def foo(key: String, value: String): Boolean = {
println(key + " = " + value)
true
}
def parse1: Parser[Boolean] = key ~ eq ~ string ^^ { case k ~ eq ~ string => foo(k, string) }
def parse2: Parser[Boolean] = key ~ eq ~ value ^^ { case k ~ eq ~ value => foo(k, value.toString) }
def parseLine(line: String): Boolean = {
parseAll(parse2, line) match {
case Success(matched, _) => true
case Failure(msg, _) => false
case Error(msg, _) => false
}
}
}
val mp = new MyParser()
for (line <- List("hey = hou", "hello = world ppl", "foo = bar baz blup")) {
println(mp.parseLine(line))
}
Explanation:
JavaTokenParsers and RegexParsers treat white space differently.
The JavaTokenParsers handles the white space for you, it's not specific for Java, it works for most non-esoteric languages. As long as you are not trying to parse Whitespace, JavaTokenParsers is a good starting point.
Your string definition included a *, which caused the infinite recursion.
Your eq definition included something that messed with the empty space handling (don't do this unless it's really necessary).
Furthermore, if you want to parse the whole line, you must call parseAll,
otherwise it parses only the beginning of the string in non-greedy manner.
Final remark: for parsing key-value pairs line by line, some String.split and
String.trim would be completely sufficient. Scala Parser Combinators are a little overkill for that.
PS: Hmm... Did you want to allow =-signs in your key-names? Then my version would not work here, because it does not enforce an empty space after the key-name.
This is not a duplicate, it's a different version with RegexParsers that takes care of whitespace explicitly
If you for some reason really care about the white space, then you could stick to the RegexParsers, and do the following (notice the skipWhitespace = false, explicit parser for whitespace ws, the two ws with squiglies around the equality sign, and the repsep with explicitly specified ws):
import scala.util.parsing.combinator._
import scala.io.Source.fromFile
class MyParser extends RegexParsers {
override def skipWhitespace(): Boolean = false
def ws: Parser[String] = "[ \t]+".r
def key: Parser[String] = """[a-zA-Z]+""".r ^^ { _.toString }
def eq: Parser[String] = ws ~> """=""" <~ ws
def string: Parser[String] = """[^ \t\n]+""".r
def value: Parser[List[String]] = repsep(string, ws)
def foo(key: String, value: String): Boolean = {
print(key + " = " + value)
true
}
def parse1: Parser[Boolean] = (key ~ eq ~ string) ^^ { case k ~ e ~ v => foo(k, v) }
def parse2: Parser[Boolean] = (key ~ eq ~ value) ^^ { case k ~ e ~ v => foo(k, v.toString) }
def parseLine(line: String): Boolean = {
parseAll(parse2, line) match {
case Success(matched, _) => true
case Failure(msg, _) => false
case Error(msg, _) => false
}
}
}
val mp = new MyParser()
for (line <- List("hey = hou", "hello = world ppl", "foo = bar baz blup", "foo= bar baz", "foo =bar baz")) {
println(" (Matches: " + mp.parseLine(line) + ")")
}
Now the parser rejects the lines where there is no whitespace around the equal sign:
hey = List(hou) (Matches: true)
hello = List(world, ppl) (Matches: true)
foo = List(bar, baz, blup) (Matches: true)
(Matches: false)
(Matches: false)
The bug with * instead of + in string has been removed, just like in the previous version.

recognizing eol in scala parser combinators

I'm trying to make a very simple parser with parser combinators (to parse something similar to BNF). I've checked several blog posts that explain the matter (the ones top-ranked at Google (for me)) and I think I understand it but the tests say otherwise.
I've checked the questions in StackOverflow and while some could maybe be applied and useful whenever I try to apply them something else breaks, so best way to to is going through an specific example:
This is my main:
def main(args: Array[String]) {
val parser: BaseParser = new BaseParser
val eol = sys.props("line.separator")
val test = s"a = b ${eol} a = c ${eol}"
System.out.println(test)
parser.parse(test)
}
This is the parser:
import com.github.trylks.tests.parser.ParserClasses._
import scala.util.parsing.combinator.syntactical._
import scala.util.parsing.combinator.ImplicitConversions
import scala.util.parsing.combinator.PackratParsers
class BaseParser extends StandardTokenParsers with ImplicitConversions with PackratParsers {
val eol = sys.props("line.separator")
lexical.delimiters += ("=", "|", "*", "[", "]", "(", ")", ";", eol)
def rules = rep1sep(rule, eol) ^^ { Rules(_) }
def rule = id ~ "=" ~ repsep(expression, "|") ^^ flatten3 { (e1: ID, _: Any, e3: List[Expression]) => Rule(e1, e3) }
def expression: Parser[Expression] = (element | parenthesized | optional) ^^ { x => x } // and sequence and repetition, but that's another problem...
def parenthesized: Parser[Expression] = "(" ~> expression <~ ")" ^^ { x => x }
def optional: Parser[Expression] = "[" ~> expression <~ "]" ^^ { Optional(_) }
def element: Parser[Element] = (id | constant) ^^ { x => x }
def constant: Parser[Constant] = stringLit ^^ { Constant(_) }
def id: Parser[ID] = ident ^^ { ID(_) }
def parse(text: String): Option[Rules] = {
val s = rules(new lexical.Scanner(text))
s match {
case Success(res, next) => {
println("Success!\n" + res.toString)
Some(res)
}
case Error(msg, next) => {
println("error: " + msg)
None
}
case Failure(msg, next) => {
println("failure: " + msg)
None
}
}
}
}
These are the classes that you are missing from the previous part of the code:
object ParserClasses {
abstract class Element extends Expression
case class ID(value: String) extends Element {
override def toString(): String = value
}
case class Constant(value: String) extends Element {
override def toString(): String = value
}
abstract class Expression
case class Optional(value: Expression) extends Expression {
override def toString() = s"[$value]"
}
case class Rule(head: ID, body: List[Expression]) {
override def toString() = s"$head = ${body.mkString(" | ")}"
}
case class Rules(rules: List[Rule]) {
override def toString() = rules.mkString("\n")
}
}
The problem is: as the code is now, it doesn't work, it parses only one rule (not both). If I replace eol with ";" (in the main and the parser) then it works (at least for this test).
Most people seem to prefer regex parsers, every blog explaining parser combinators doesn't get into details about the traits that could be extended or not, so I have no idea about those differences or why there are several (I say this because it may be important to understand why the code doesn't work). The problem is: If I try to use regex parsers then I get errors for all the strings that I have specified in the parsers "=", "*", etc.

find unclose string in scala (lexical analysis)

I am developing lexical analysis for my program language. I want to produce the fail string which have open quote but dont have close quote. Ex: "hello
class SimpleLexer extends StdLexical {
import scala.util.parsing.input.CharArrayReader.EofCh
def regex(r: Regex): Parser[String] = new Parser[String] {
def apply(in: Input) = {
val source = in.source
val offset = in.offset
(r findPrefixMatchOf (source.subSequence(offset, source.length))) match {
case Some(matched) =>
Success(source.subSequence(offset, offset + matched.end).toString,
in.drop(matched.end))
case None =>
Failure("string matching regex `" + r + "' expected but `" + in.first + "' found", in.drop(0))
}
}
}
override def token: Parser[Token] = {
// Adapted from StdLexical
(
'\"' ~ rep( chrExcept('\"', '\n','\t','\b','\f','\r', EofCh) ) ~ '\"' ^^ { case '\"' ~ chars ~ '\"' => StringLit(chars mkString "") }
|'\"' ~> failure("Unclosed string: "+"??") // I want produce fail string
|EofCh ^^^ EOF
|delim
)
}
override def whitespace: Parser[Any] = rep(
whitespaceChar
| '/' ~ '*' ~ comment
| '/' ~ '*' ~> failure("unclosed comment"))
override protected def comment: Parser[Any] = (
'*' ~ '/' ^^ { case _ => ' ' }
| chrExcept(EofCh) ~ comment)
}
Excample:
input: " safs i
output: ErrorToken(Unclosed string: " safs i)
Can you help me solve this problem.
Thanks.
My answer for your question
override def token: Parser[Token] = {
// Adapted from StdLexical
(
'\"' ~ rep( chrExcept('\"', '\n','\t','\b','\f','\r', EofCh) ) ~ '\"' ^^ { case '\"' ~ chars ~ '\"' => StringLit(chars mkString "") }
|'\"' ~> rep( chrExcept('\"', '\n','\t','\b','\f','\r', EofCh) ) ^^ {chars => ErrorToken(("\"" :: chars) mkString "")}
|EofCh ^^^ EOF
|delim
)
}

Scala RegexParsers

I am using the following object to parse csv. The parser seems to be working correctly except spaces are being stripped out. Could someone help me figure out where that is happening. Thanks
object CsvParser extends RegexParsers {
override protected val whiteSpace = """[ \t]""".r
def COMMA = ","
def DQUOTE = "\""
def DQUOTE2 = "\"\"" ^^ { case _ => "\"" }
def CR = "\r"
def LF = "\n"
def CRLF = "\r\n"
def TXT = "[^\",\r\n]".r
def record: Parser[List[String]] = rep1sep(field, COMMA)
def field: Parser[String] = (escaped | nonescaped)
def escaped: Parser[String] = (DQUOTE ~> ((TXT | COMMA | CR | LF | DQUOTE2)*) <~ DQUOTE) ^^ { case ls => ls.mkString("") }
def nonescaped: Parser[String] = (TXT*) ^^ { case ls => ls.mkString("") }
def parse(s: String) = parseAll(record, s) match {
case Success(res, _) => res
case _ => List[List[String]]()
}
}
I think I figured it out. Needed to add:
override val skipWhitespace = false