2016-10-12

Scala URL Parser Combinator

URL Parser Combinator

(updated 2016-10-16)
It has bugged me that there is no ready-made URL parser, so I made one.

Parser Combinator

Scala provides an excellent parsing tool: parser combinators, which mix a DSL (domain-specific language) written in ordinary Scala code with regular expressions. Together with unapply (pattern matching), this makes it easy to match the input and extract the matching pieces.

URL

According to Wikipedia, a URL consists of the following parts (some are optional):
scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]

Code

Enjoy! I won't go into detail, other than to say that the parser breaks down the URL in the following order:
  1. Split URL by
    1. Mandatory scheme
    2. Optional domain
    3. Mandatory path
    4. Optional query and fragment
  2. Domain is split by
    1. Authorization
    2. Domain
    3. Port
Each part is usually matched with a greedy regular expression that consumes input up to the next delimiter.
// https://github.com/scala/scala-parser-combinators
import scala.util.parsing.combinator.RegexParsers
/**
 * The pieces a URL is broken into by the URL parser.
 *
 * NOTE(review): this class extends `RegexParsers`, but none of the members
 * declared here use it; the supertype is kept only so the type's interface
 * stays unchanged.
 *
 * @param scheme        the part before the first colon
 * @param authorization user name and optional password, when present
 * @param domain        host name, when the URL has a `//` part
 * @param port          numeric port, when present
 * @param path          the path part
 * @param query         text after `?`, when present
 * @param fragment      text after `#`, when present
 */
case class UrlTokens(
    scheme: String,
    authorization: Option[(String, Option[String])],
    domain: Option[String],
    port: Option[Int],
    path: String,
    query: Option[String],
    fragment: Option[String]) extends RegexParsers {

  /** The path split on `/`, as computed by `HelperParser.splitPathBySlash`. */
  lazy val splitPath = HelperParser.splitPathBySlash(path)

  /** The query, when present, run through `HelperParser.splitIntoValuePairs`. */
  lazy val splitQueryToValuePairs =
    query.map(text => HelperParser.splitIntoValuePairs(text))

  /** The fragment, when present, run through `HelperParser.splitIntoValuePairs`. */
  lazy val splitFragmentToValuePairs =
    fragment.map(text => HelperParser.splitIntoValuePairs(text))
}
/**
 * Parser combinator for URLs.
 *
 * <p>
 * From Wikipedia:
 * <pre>
 * scheme:[//[user:password@]host[:port]][/]path[?query][#fragment]
 * </pre>
 * </p>
 *
 * <p>
 * Parsing a URL:
 * </p>
 * <ol>
 * <li>Split by mandatory scheme, optional domain, path (may be empty) and then optional query and fragment</li>
 * <li>If there's a domain, split it by optional authorization, mandatory host and optional port</li>
 * </ol>
 * <p>
 * Each part is matched by a greedy regular expression that stops at the next delimiter.
 * </p>
 *
 * <p>
 * Return shape: [[urlParser]] already wraps its `UrlTokens` in `Right`, and
 * [[toResult]] wraps every successful result in `Right` again, so a
 * successful [[apply]] yields `Right(Right(tokens))`. This nesting is kept
 * unchanged for backward compatibility.
 * </p>
 *
 * @see http://www.scala-lang.org/files/archive/api/2.11.2/scala-parser-combinators/#scala.util.parsing.combinator.RegexParsers
 */
class UrlParser extends RegexParsers {

  /** Top-level grammar: scheme, optional `//` part, path, optional query and fragment. */
  def urlParser = scheme ~ opt(domain) ~ path ~ opt(query) ~ opt(fragment) ^^ {
    case parsedScheme ~ authority ~ parsedPath ~ parsedQuery ~ parsedFragment => authority match {
      case Some((credentials, host, portNumber)) =>
        Right(new UrlTokens(parsedScheme, credentials, Some(host), portNumber, parsedPath, parsedQuery, parsedFragment))
      case None =>
        Right(new UrlTokens(parsedScheme, None, None, None, parsedPath, parsedQuery, parsedFragment))
    }
  }

  // Public building-block patterns. Some are no longer referenced by the
  // grammar below but are kept so existing users of these members still compile.
  val notSlash = """[^\/]+""".r
  val notDot = """[^\.]+""".r
  val notColon = """[^:]+""".r
  val notAt = """[^@]+""".r
  val notColonOrSlash = """[^:\/]+""".r
  val numbers = """\d+""".r
  val notQuestionmarkOrHash = """[^\?#]+""".r
  val notHash = """[^\#]*""".r
  val any = """.*""".r

  // User name and password must not run past '@' or into the path, query or
  // fragment; otherwise "user@host" is never recognised and
  // "host:80/a@b" is mistaken for credentials.
  private val userName = """[^:@\/?#]+""".r
  private val password = """[^@\/?#]+""".r
  // The host stops at the port, path, query or fragment delimiter.
  private val hostName = """[^:\/?#]+""".r
  // Zero-width match: the "//" part may also end right before '?', '#' or at end of input.
  private val authorityEnd = """(?=[?#]|\z)""".r
  // The path may be empty, e.g. "http://host" or "http://host/".
  private val pathChars = """[^\?#]*""".r

  /** Everything before the first colon; the colon itself is consumed and dropped. */
  def scheme: Parser[String] = notColon <~ ":"

  /**
   * The `//[user[:password]@]host[:port]` part.
   *
   * The slash that separates it from the path is consumed when present, so the
   * path of `http://host/a/b` is `a/b`. The slash is optional when the input
   * ends or a query/fragment follows, so `http://host` and `http://host?x=1`
   * are recognised as well.
   *
   * @return a triple of (optional authorization, host, optional port)
   */
  def domain = "//" ~> opt(authorization <~ "@") ~ hostName ~ opt(":" ~> port) <~ (literal("/") | regex(authorityEnd)) ^^ {
    case optionalAuthorization ~ host ~ optionalPort => (optionalAuthorization, host, optionalPort)
  }

  /** User name with an optional `:password`; the trailing `@` is handled by [[domain]]. */
  def authorization = userName ~ opt(":" ~> password) ^^ { case user ~ optionalPassword => (user, optionalPassword) }

  /**
   * A run of digits converted to `Int`.
   *
   * Digit runs that do not fit in an `Int` fail the parse instead of letting
   * `toInt` throw a `NumberFormatException` out of the parser.
   */
  def port: Parser[Int] = regex(numbers) ^? (
    { case digits if BigInt(digits).isValidInt => digits.toInt },
    digits => "port number out of range: " + digits
  )

  /** Everything up to the first `?` or `#`; may be empty. */
  def path = pathChars

  /** The text between `?` and the first `#` (or end of input). */
  def query: Parser[String] = "?" ~> notHash

  /** The text after `#`. */
  def fragment: Parser[String] = "#" ~> any

  /**
   * Parses a complete URL.
   *
   * @param url the text to parse
   * @return `Right(Right(tokens))` on success (see the class comment for the
   *         nesting), `Left(message)` when the input does not parse
   */
  def apply(url: String): Either[String, Any] = toResult(url, urlParser)

  /** Runs an arbitrary parser of this grammar against the whole of `url`. */
  def applyWith(url: String, parser: Parser[_]): Either[String, Any] = toResult(url, parser)

  /**
   * Runs `parser` over all of `text`.
   *
   * @return `Right(result)` on success, `Left(message)` on failure or error
   */
  def toResult(text: String, parser: Parser[_]): Either[String, Any] = {
    parseAll(parser, text) match {
      case Success(result, _) => Right(result)
      case Failure(error, _) => Left(error)
      case Error(error, _) => Left(error)
    }
  }
}
/** Companion exposing one shared [[UrlParser]] instance for convenient one-off parsing. */
object UrlParser {

  /** The shared parser, created on first access. */
  lazy val urlParser = new UrlParser

  /** Parses `url` by delegating to the shared instance's `apply`. */
  def apply(url : String) = urlParser.apply(url)
}
/**
 * Parse common patterns.
 * <ul>
 * <li>Split domain with dot</li>
 * <li>Split path with slash</li>
 * <li>Split query or fragment with ampersand and then create value pairs by splitting on equals</li>
 * </ul>
 * Every `split...` method returns `None` when the whole input cannot be parsed.
 */
object HelperParser extends RegexParsers {

  /** Converts a parse result into an `Option`: `Some(result)` on success, `None` otherwise. */
  implicit def parserToOption[T](parserResult : ParseResult[T]) : Option[T] = parserResult match {
    case Success(result, _) => Some(result)
    case _ => None
  }

  val notDot = """[^\.]+""".r

  /** Non-empty labels separated by dots. */
  def domainsByDot = repsep(notDot, ".")

  /** Splits a domain such as `www.example.com` into its dot-separated labels. */
  def splitDomainByDot(domain : String) : Option[List[String]] =
    parserToOption(parseAll(domainsByDot, domain))

  val notSlash = """[^\/]*""".r

  /** Possibly empty segments separated by slashes. */
  def pathBySlash = repsep(notSlash, "/")

  /** Splits a path on `/`; empty segments are kept as empty strings. */
  def splitPathBySlash(path : String) : Option[List[String]] =
    parserToOption(parseAll(pathBySlash, path))

  // Kept for compatibility; `pair` no longer uses it because a value matched
  // with this pattern also swallows '&' and the following key.
  val notEquals = """[^\=]+""".r

  // A key ends at '=' and must not cross a pair separator.
  private val pairKey = """[^=&]+""".r
  // A value ends at the next '&'; it may itself contain '='.
  private val pairValue = """[^&]+""".r

  /** `key=value` pairs separated by `&`. */
  def valuePairs = repsep(pair, "&")

  /** One `key=value` pair; the value may be missing (`key=`), the `=` may not. */
  def pair = pairKey ~ "=" ~ opt(pairValue) ^^ {
    case key ~ _ ~ optionalValue => (key, optionalValue)
  }

  /**
   * Split by ampersand (&) and then equals (=).
   *
   * For example `a=1&b=` yields `Some(List(("a", Some("1")), ("b", None)))`.
   * Input in which some pair has no `=` yields `None`.
   */
  def splitIntoValuePairs(text : String) : Option[List[(String, Option[String])]] =
    parserToOption(parseAll(valuePairs, text))
}
view raw UrlParser.scala hosted with ❤ by GitHub

Inga kommentarer:

Skicka en kommentar