ChunkParsers

Mostly, this is a series of regular expressions built to find the next chunk in a markdown document. Some expressions, like headings, will determine the "real span" part separately from the expressions here.

All of the methods return a Chunk parser type, even when grouping the parsers together. To group things together, the foldedString will combine

// In com/tristanhunt/knockoff/ChunkParsers.scala
package com.tristanhunt.knockoff

import scala.util.parsing.combinator.RegexParsers

class ChunkParser extends RegexParsers with StringExtras {
    
  override def skipWhitespace = false
  
  def chunk : Parser[ Chunk ] = {
    horizontalRule | bulletItem | numberedItem | indentedChunk |
    header | blockquote | linkDefinition | textBlock | emptyLines
  }
  
  def emptyLines : Parser[ Chunk ] =
    rep1( emptyLine ) ^^ ( str => EmptySpace( foldedString( str ) ) )
  
  def emptyLine : Parser[ Chunk ] =
    """[\t ]*\n""".r ^^ ( str => EmptySpace( str ) )

  def textBlock : Parser[ Chunk ] =
    rep1( textLine ) ^^ ( str => TextChunk( foldedString( str ) ) )

  /** Match any line up until it ends with a newline. */
  def textLine : Parser[ Chunk ] =
    """[\t ]*\S[^\n]*\n?""".r ^^ ( str => TextChunk( str ) )
  
  def bulletItem : Parser[ Chunk ] =
    bulletLead ~ rep( trailingLine ) ^^ {
      case ~(lead, texts) => BulletLineChunk( foldedString( lead :: texts ) ) }
  
  /** Match a single line that is likely a bullet item. */
  def bulletLead : Parser[ Chunk ] =
    """[ ]{0,3}[*\-+](\t|[ ]{0,4})""".r ~> not("[*\\-+]".r) ~> textLine ^^ {
      textChunk => BulletLineChunk( textChunk.content ) }
  
  def numberedItem : Parser[ Chunk ] =
    numberedLead ~ rep( trailingLine ) ^^ {
      case ~(lead, texts) => NumberedLineChunk( foldedString( lead :: texts )) }
  
  def numberedLead : Parser[ Chunk ] =
    """[ ]{0,3}\d+\.(\t|[ ]{0,4})""".r ~> textLine ^^ {
      textChunk => NumberedLineChunk( textChunk.content ) }
  
  def trailingLine : Parser[ Chunk ] =
    """\t|[ ]{0,4}""".r ~> """[\S&&[^*\-+]&&[^\d]][^\n]*\n?""".r ^^ (
      s => TextChunk(s) )
  
  def header : Parser[ Chunk ] =
    ( setextHeaderEquals | setextHeaderDashes | atxHeader )

  def setextHeaderEquals : Parser[ Chunk ] =
    textLine <~ equalsLine ^^ ( s => HeaderChunk( 1, s.content.trim ) )

  def setextHeaderDashes : Parser[ Chunk ] =
    textLine <~ dashesLine ^^ ( s => HeaderChunk( 2, s.content.trim ) )

  def equalsLine : Parser[Any] = """=+\n""".r

  def dashesLine : Parser[Any] = """-+\n""".r

  def atxHeader : Parser[ Chunk ] =
    """#+ .*\n?""".r ^^ (
      s => HeaderChunk( s.countLeading('#'), s.trim('#').trim ) )
  
  def horizontalRule : Parser[ Chunk ] =
    """[ ]{0,3}[*\-_][\t ]?[*\-_][\t ]?[*\-_][\t *\-_]*\n""".r ^^ {
      s => HorizontalRuleChunk }
  
  def indentedChunk : Parser[ Chunk ] =
    rep1( indentedLine ) ^^ ( lines => IndentedChunk( foldedString( lines ) ) )
  
  def indentedLine : Parser[ Chunk ] =
    """\t|[ ]{4}""".r ~> ( textLine | emptyLine | emptyString )

  def emptyString : Parser[ Chunk ] = "".r ^^ ( s => EmptySpace(s) )
  
  def blockquote : Parser[ Chunk ] =
    blockquotedLine ~ rep( blockquotedLine | textLine ) ^^ {
      case ~(lead, trailing) =>
        BlockquotedChunk( foldedString( lead :: trailing ) ) }
  
  def blockquotedLine : Parser[ Chunk ] =
    """^>[\t ]?""".r ~> ( textLine | emptyLine )

  def linkDefinition : Parser[ Chunk ] =
    linkIDAndURL ~ opt( linkTitle ) <~ """[ ]*\n?""".r ^^ {
      case ~( idAndURL, titleOpt ) =>
        LinkDefinitionChunk( idAndURL._1, idAndURL._2, titleOpt ) }

  private def linkIDAndURL : Parser[ (String, String) ] =
    """[ ]{0,3}\[[^\[\]]*\]:[ ]+<?[\w\p{Punct}]+>?""".r ^^ { linkString =>
      val linkMatch = """^\[([^\[\]]+)\]:[ ]+<?([\w\p{Punct}]+)>?$""".r
                        .findFirstMatchIn( linkString.trim ).get;
      ( linkMatch.group(1), linkMatch.group(2) )
    }

  private def linkTitle : Parser[ String ] =
    """\s*""".r ~> """["'(].*["')]""".r ^^ (
      str => str.substring( 1, str.length - 1 ) )
  
  // Utility Methods
  
  /** Take a series of very similar chunks and group them. */
  private def foldedString( texts : List[ Chunk ] ) : String =
    ( "" /: texts )( (current, text) => current + text.content )
}

ChunkParsersSpec

// In test com/tristanhunt/knockoff/ChunkParsersSpec.scala
package com.tristanhunt.knockoff

import org.scalatest._
import org.scalatest.matchers._

class ChunkParsersSpec extends ChunkParser with Spec with ShouldMatchers {

  describe("ChunkParser") {
    it("should handle simple bullet items") {
      val src = "* item 1\n* item 2\n"
      parse( chunk, src ).get should equal ( BulletLineChunk("item 1\n") )
    }
    
    it("should group a second line that's not a bullet") {
      val src = "*   item 1\n    more\n"
      parse( chunk, src ).get should equal (
        BulletLineChunk("item 1\nmore\n")
      )
    }
    
    it("should ignore whitespace around headers") {
      val src = "# Header 1 #"
      parse( chunk, src ).get should equal { HeaderChunk(1, "Header 1") }
    }
    
    it("should be ok with empty code blocks") {
      val src = "    "
      parse( chunk, src ).get should equal { IndentedChunk("") }
    }
    
    it("should not explode on a code block with a trailing line") {
      val line = "    line\n    "
      parse( chunk, line ).get should equal { IndentedChunk("line\n") }
    }
    
    it("should handle nothin' but code") {
      val src = "    This is just a code block.\n" +
                "    \n" +
                "    And it has a trailing whitespace line... that's also indented.\n" +
                "    "
      parse( chunk, src ).get should equal { IndentedChunk(
        "This is just a code block.\n" +
        "\n" +
        "And it has a trailing whitespace line... that's also indented.\n"
      ) }
    }
  }
}