Textile to HTML

Textile is a markup language that is meant to make it quick and easy to write basic web pages for wikis, blogs and other uses. Redcloth is a ruby class that converts textile to html, written by _why. Details are at http://www.whytheluckystiff.net/ruby/redcloth/ . Unfortunately, due to my own failings, I find it hard to understand _why’s code, so I wrote a partial reimplementation below. This has the benefit of (for me) being clearer and easier to extend. The drawback is that it doesn’t do all that _why’s version does and is slower. But it is here in case anyone finds it useful.
require 'strscan'

# Note: Does not implement anything beyond that on http://hobix.com/textile/
# i.e. No markdown, link titles, and converting out of range ASCII
# AND IT IS MUCH SLOWER THAN REDCLOTH !!
#
# Bodged together in a few evenings by Tom Counsell (tamc2@cam.ac.uk)
# Feel free to do whatever you like with the code.
#
# Use is the same as for Redcloth (except that none of the options work):
# html = TextileParser.new( "textile string").to_html 
#
# Converts a Textile string to HTML with a StringScanner-driven recursive
# descent: block commands are tried first, then inline commands inside each
# block, then phrase modifiers and glyph substitutions on individual words.
#
# Known limitations (kept from the original design):
# * Block signatures are matched loosely, so ordinary text such as
#   "please wait. Then" can be read as a "p. " paragraph signature.
# * A table cell that begins with "something." has that prefix parsed as
#   cell attributes.
# * Inline whitespace is matched with \s+, so a blank line preceded by
#   trailing spaces does not end the paragraph.
# * Raw HTML in ordinary text is passed through unescaped.
class TextileParser

    # Block-level commands, tried in order at the start of every block.
    # Each entry is [ pattern, private handler method ].
    BLOCK_COMMANDS = [
        [ /<pre.*?>/, :pre_tag ],
        [ /<code.*?>/, :code_tag ],
        [ /<notextile>/, :notextile_tag ],
        [ /h(\d)(.*?)\.\s+/i, :heading_block ],
        [ /bq(.*?)\.\s+/i, :quote_block ],
        [ /(\*+|\#+)\s+/, :list_block ],
        [ /table(.*?)\.\s+/i, :table_block ],
        [ /\|/i, :unspecified_table_block ],
        [ /\{.+?\}\.\s+\|/i, :unspecified_table_block ], # A styled table row
        [ /p(.*?)\.\s+/i, :paragraph_block ],
        [ /fn(\d+)(.*?)\.\s+/i, :footnote_block ],
        [ /\S+/i, :unspecified_block ],
        [ /\s+/i, :skip ],
    ].freeze

    # Inline commands shared by ordinary text and table cells.
    COMMANDS = [
        [ /<pre.*?>/, :pre_tag ], # Can be inline as well as in a block
        [ /<code.*?>/, :code_tag ], # Can be inline as well as in a block
        [ /<notextile>/, :notextile_tag ], # Can be inline as well as in a block
        [ /@(\S.*?\S)@/, :quick_escape_code ],
        [ /"(.+?)":(\S*\w\/?)/, :textile_link ],
        [ /\[(.+?)\]:(\S*\w\/?)/, :textile_link_alias ],
        [ /!(.+?)!(\S*)/, :image_link ],
        [ /([A-Z0-9]+)\((.*?)\)/, :acronym ],
        [ /(\S+?)\[(\d+)\]/, :footnote ]
    ].freeze

    # A table needs a different definition of a phrase (it stops at "|").
    INLINE_COMMANDS = ( COMMANDS + [
        [ /\S+/i, :phrase ],
        [ /\s+/i, :space ],
    ] ).freeze

    TABLE_INLINE_COMMANDS = ( COMMANDS + [
        [ /[^\s\t\r\n\f\|]+/i, :phrase ],
        [ /\s+/i, :space ],
    ] ).freeze

    # Phrase modifiers applied to single words, as
    # [ opening pattern, closing pattern, html tag ].
    PHRASE_MODIFIERS = [
        [ '__', 'i' ],
        [ '**', 'b' ],
        [ '_', 'em' ],
        [ '*', 'strong' ],
        [ '??', 'cite' ],
        [ '-', 'del' ],
        [ '+', 'ins' ],
        [ '^', 'sup' ],
        [ '~', 'sub' ],
        [ '%', 'span' ],
    ].map { |marker, tag| [ /^#{Regexp.escape( marker )}/, /#{Regexp.escape( marker )}$/, tag ] }.freeze

    # Character substitutions done last to any word. Replacements are numeric
    # HTML entities so the output does not depend on the source encoding.
    GLYPHS = [
        [ /^'/, '&#8216;' ], # single opening
        [ /^"/, '&#8220;' ], # double opening
        [ /'$/, '&#8217;' ], # single closing
        [ /"$/, '&#8221;' ], # double closing
        [ /\.{3}$/, '&#8230;' ], # ellipsis
        [ '--', '&#8212;' ], # em dash
        [ '->', ' &rarr; ' ], # right arrow
        [ /^-$/, ' &#8211; ' ], # en dash: a hyphen standing on its own

        [ '(TM)', '&#8482;' ], # trademark
        [ '(R)', '&#174;' ], # registered
        [ '(C)', '&#169;' ] # copyright
    ].freeze

    # Layout only. A tag listed here is written at the current indent and is
    # followed by a newline when it closes; a non-zero value also puts a
    # newline after the opening tag and indents its content by that amount.
    INDENTING_TAGS = {
        'ul' => 1,
        'ol' => 1,
        'li' => 0,
        'blockquote' => 1,
        'table' => 1,
        'tr' => 1,
        'td' => 0,
        'th' => 0,
        'p' => 0,
    }.freeze

    # text:: the Textile source to convert.
    def initialize( text )
        @text = text
    end

    # Returns the HTML for the text given to the constructor, without
    # trailing newlines. +settings+ is accepted so calls written for RedCloth
    # still work; it is ignored.
    def to_html( settings = nil )
        reset
        convert_text
        html.chomp.chomp
    end

    private

    ## Methods dealing with blocks of text are called first

    # Consumes the whole input block by block, then resolves link aliases.
    def convert_text
        until @scanner.eos?
            dispatch BLOCK_COMMANDS
            add_to_html "\n" # Prettier html if extra space between blocks
        end
        insert_any_link_aliases
        html
    end

    # Runs the handler of the first command whose pattern matches at the
    # scanner position. If nothing matches, one character is copied through
    # so that the calling loop always makes progress.
    def dispatch( commands )
        _pattern, handler = commands.detect { |regexp, _method| @scanner.scan( regexp ) }
        return send( handler ) if handler
        add_to_html @scanner.getch
    end

    # These are all the block commands

    def paragraph_block
        tag( 'p', parse_attributes( @scanner[1] ) ) do
            standard_paragraph
        end
    end

    def quote_block
        tag( 'blockquote', parse_attributes( @scanner[1] ) ) do
            tag 'p' do
                standard_paragraph
            end
        end
    end

    # Plain text with no block signature: hand the word back to the scanner
    # and treat the block as a paragraph.
    def unspecified_block
        @scanner.unscan
        tag 'p' do
            standard_paragraph
        end
    end

    def heading_block
        tag( "h#{@scanner[1]}", parse_attributes( @scanner[2] ) ) do
            standard_line # Assume titles may only be on one line
        end
    end

    def list_block
        ordered = list_ordered? # See what sort of list we have
        depth = list_depth
        @scanner.unscan # So that the lines can be scanned individually
        tag( ordered ? 'ol' : 'ul' ) do
            list_line( ordered, depth ) until end_of_list?( depth )
        end
    end

    def table_block
        tag( 'table', parse_attributes( @scanner[1] ) ) do
            table_line until end_of_paragraph?
        end
    end

    def unspecified_table_block
        @scanner.unscan
        tag( 'table' ) do
            table_line until end_of_paragraph?
        end
    end

    def footnote_block
        number = @scanner[1]
        attributes = parse_attributes( @scanner[2] )
        attributes[:id] = "fn#{number}"
        tag 'p', attributes do
            add_to_html "<sup>#{number}</sup> "
            standard_line until end_of_paragraph?
        end
    end

    # Now descend into methods dealing with lines of text

    def pre_tag
        escape_tag 'pre'
    end

    def code_tag
        escape_tag 'code'
    end

    def notextile_tag
        escape_tag 'notextile', false
    end

    # Copies everything up to the matching close tag, html-escaped. Nested
    # tags of the same name are counted so the outermost close tag ends the
    # section. If there is no close tag nothing more is consumed.
    def escape_tag( tag, include_tag_in_output = true )
        add_to_html( @scanner.matched ) if include_tag_in_output
        level = 1
        while level > 0
            break unless @scanner.scan( /(.*?)(<(\/)?#{tag}.*?>)/m ) # Breaks if no closing tag
            add_to_html( htmlesc( @scanner[1] || '' ) )
            level += @scanner[3] ? -1 : 1
            add_to_html( htmlesc( @scanner[2] ) ) unless level == 0
        end
        add_to_html "</#{tag}>" if include_tag_in_output
    end

    def quick_escape_code
        tag 'code' do
            add_to_html( htmlesc( @scanner[1] ) )
        end
    end

    # One list item. A marker of a different kind or depth than the
    # enclosing list starts a nested list inside the item.
    def list_line( ordered, depth )
        tag 'li' do
            @scanner.scan( /(#+|\*+)\s+/ )
            if ( list_ordered? == ordered ) && ( list_depth == depth )
                standard_line
            else # Recursive for sub lists
                list_block
            end
        end
    end

    def table_line
        # Are there row attributes at the start of the line?
        attributes = @scanner.scan( /(\{.+?\})\.\s+/ ) ? parse_attributes( @scanner[1] ) : {}
        @scanner.scan( /\|/ ) # Get rid of any leading cell opening
        tag( 'tr', attributes ) do
            table_cell until end_of_table_line?
        end
    end

    def standard_paragraph
        dispatch INLINE_COMMANDS until end_of_paragraph?
    end

    def standard_line
        dispatch INLINE_COMMANDS until end_of_line?
    end

    # Now descend into methods dealing with phrases

    def table_cell
        # Style defined at start of cell? A leading "_" marks a header cell.
        header = nil
        attributes = {}
        if @scanner.scan( /(_)?(\S*?)\.\s*/ )
            header = @scanner[1]
            attributes = parse_attributes( @scanner[2] )
        end
        tag( header ? 'th' : 'td', attributes ) do
            dispatch TABLE_INLINE_COMMANDS until end_of_table_cell?
        end
    end

    def footnote
        add_to_html "#{@scanner[1]}<sup><a href=\"#fn#{@scanner[2]}\">#{@scanner[2]}</a></sup>"
    end

    def acronym
        add_to_html "<acronym title=\"#{attribute_escape( @scanner[2] )}\">#{@scanner[1]}</acronym>"
    end

    # Handles one whitespace-delimited word: opens a phrase tag if the word
    # starts with a modifier, closes one if it ends with the modifier of a
    # tag that is currently open, and applies the glyph substitutions.
    def phrase
        word = @scanner.matched

        # If a word is entirely symbols then we will leave it in peace.
        return add_to_html( parse_glyphs( word ) ) unless word =~ /\w+/

        # Open tags
        PHRASE_MODIFIERS.each do |start_r, _end_r, tag|
            opening = start_r.match( word )
            next unless opening
            word = opening.post_match
            # Attributes are only recognised directly after the modifier, so
            # brackets later in the word, as in *call(x)*, stay text.
            attributes = /\A(\(.+?\)|\{.+?\}|\[.+?\])/.match( word )
            if attributes
                word = attributes.post_match
                open_tag( tag, parse_attributes( attributes[1] ) )
            else
                open_tag tag
            end
            break
        end

        # Close tags. A trailing modifier character only counts when its tag
        # is open; otherwise it is ordinary text, as in "3.0%" or "C++".
        end_tag = nil
        PHRASE_MODIFIERS.each do |_start_r, end_r, tag|
            next unless open_tags.include?( tag )
            closing = end_r.match( word )
            next unless closing
            end_tag = tag
            word = closing.pre_match
            break
        end
        add_to_html parse_glyphs( word )
        close_tag( end_tag ) if end_tag
    end

    def space
        add_to_html @scanner.matched
    end

    # !src!, !src(title)!, an optional leading < or > to float the image and
    # an optional trailing :url to wrap it in a link.
    def image_link
        source = @scanner.matched
        parts = /^!([<>]*)(.*?)(!|\((.*?)\)!)($|(:(.+?)$))/.match( source )
        # Not really an image (e.g. "!a!b"): keep the text instead of writing
        # an <img> without a source.
        return add_to_html( source ) unless parts

        alignment, src, title, url = parts[1], parts[2], parts[4], parts[7]
        attributes = {}
        attributes[:style] = 'float:right' if alignment == '>'
        attributes[:style] = 'float:left' if alignment == '<'
        attributes[:src] = src
        if title
            attributes[:alt] = title
            attributes[:title] = title
        end

        if url
            tag 'a', { :href => url } do
                open_tag 'img', attributes, true
            end
        else
            open_tag 'img', attributes, true
        end
    end

    def skip
        # Do nothing !
    end

    def textile_link
        add_to_html "<a href=\"#{attribute_escape( @scanner[2] )}\">#{@scanner[1]}</a>"
    end

    def textile_link_alias
        # These are saved for later resubstitution
        @aliases[ @scanner[1] ] = @scanner[2]
    end

    # Done last: every href that is exactly an alias name is replaced by the
    # alias target. The name is regexp-escaped and the replacement comes from
    # a block, so neither is interpreted as pattern or back-reference syntax.
    def insert_any_link_aliases
        @aliases.each do |als, href|
            pattern = /href="#{Regexp.escape( attribute_escape( als ) )}"/
            html.gsub!( pattern ) { "href=\"#{attribute_escape( href )}\"" }
        end
    end

    # These are helper methods that make sure html is properly closed and indented

    # Opens +tag+, yields for the content, then closes it together with any
    # inner tags that were left open.
    def tag( tag, attributes = {} )
        open_tag( tag, attributes )
        yield
        close_tag tag
    end

    # Closes open tags, innermost first, up to and including +tag+ (or all
    # of them for :all_tags). Does nothing if +tag+ is not open.
    def close_tag( tag = :all_tags )
        # Check the tag has been opened
        return unless open_tags.include?( tag ) || ( tag == :all_tags )

        # Close all tags up to that tag (in case one was not closed)
        until open_tags.empty?
            current = open_tags.pop
            indent = INDENTING_TAGS[ current ]
            # This is just stuff to make the html look pretty
            if indent.nil?
                add_to_html "</#{current}>"
            elsif indent == 0
                add_to_html "</#{current}>\n"
            else
                add_to_html "\n" unless html.end_with?( "\n" )
                @indent -= indent
                add_to_html( "\t" * @indent )
                add_to_html "</#{current}>\n"
            end
            return if current == tag
        end
    end

    # Writes the opening tag with its attributes. With +no_close_tag+ the
    # tag is written self-closed and is not remembered as open.
    def open_tag( tag, attributes = {}, no_close_tag = false )
        indent = INDENTING_TAGS[ tag ]
        # Only layout tags are indented; a tab in front of an inline tag
        # would end up inside the running text.
        add_to_html( "\t" * @indent ) if indent

        add_to_html "<#{tag}"
        attributes.each { |key, value| add_to_html( " #{key}=\"#{attribute_escape( value )}\"" ) }

        if no_close_tag
            add_to_html " />"
            return
        end

        add_to_html ">"

        if indent
            add_to_html "\n" unless indent == 0
            @indent += indent
        end
        open_tags << tag
    end

    def open_tags
        @open_tags ||= []
    end

    # Turns a Textile attribute string such as "(class#id){color:red}[en]<>"
    # into a hash of html attributes. Empty attributes are left out. The
    # argument itself is not modified.
    def parse_attributes( attribute_text )
        return {} if attribute_text.nil? || attribute_text.empty?
        text = attribute_text.dup
        a = { :style => ''.dup }

        # The hand-entered classes, ids, styles and langs
        # These are removed from text so their content cannot be matched below
        a[:lang] = $1 if text.sub!( /\[(.+?)\]/, '' )
        a[:class] = $1 if text.sub!( /\((.+?)\)/, '' )
        a[:class], a[:id] = $1, $2 if a[:class] && a[:class] =~ /^(.*?)#(.*)$/
        a[:style] << "#{$1};" if text.sub!( /\{(.+?)\}/, '' )

        # Various padding and indents
        a[:style] << "padding-left:#{ $1.length }em;" if text =~ /(\(+)/
        a[:style] << "padding-right:#{ $1.length }em;" if text =~ /(\)+)/

        # The various alignments ("<>" is justify only, not left or right)
        a[:style] << "text-align:left;" if text =~ /<(?!>)/
        a[:style] << "text-align:right;" if text =~ /(?<!<)>/
        a[:style] << "text-align:justify;" if text =~ /<>/
        a[:style] << "text-align:center;" if text =~ /=/

        # Various column spans on tables
        a[:colspan] = $1 if text =~ /\\(\d+)/
        a[:rowspan] = $1 if text =~ /\/(\d+)/

        # Vertical alignments on tables
        a[:style] << "vertical-align:top;" if text =~ /\^/
        a[:style] << "vertical-align:bottom;" if text =~ /\~/

        # Get rid of any empty attributes before returning
        a.delete_if { |_key, value| value.nil? || ( value == '' ) }
    end

    # Returns +word+ with every GLYPHS substitution applied, in order.
    def parse_glyphs( word )
        GLYPHS.inject( word ) { |text, ( pattern, replacement )| text.gsub( pattern, replacement ) }
    end

    # Now some helper methods for spotting the ends of sections.
    # Those that use scan consume the terminator they find.

    def end_of_paragraph?
        return true if @scanner.eos?
        @scanner.scan( /\n{2,}/ )
    end

    def end_of_list?( depth )
        return true if @scanner.eos?
        return true unless @scanner.check( /(#+|\*+)\s+/ ) # Not a list any more
        return true if list_depth < depth # End of this sub list
        @scanner.scan( /\n{2,}/ )
    end

    def end_of_line?
        return true if @scanner.eos?
        return true if @scanner.check( /\n{2,}/ )
        @scanner.scan( /\n/ )
    end

    def end_of_table_line?
        return true if @scanner.eos?
        return true if @scanner.check( /\n{2,}/ )
        # Row closed just before a blank line: drop the pipe but leave the
        # blank line so that end_of_paragraph? can end the table.
        return true if @scanner.scan( /\|[ \t]*(?=\n{2,})/ )
        # Otherwise the row ends at the newline (or end of input), with or
        # without a closing pipe.
        @scanner.scan( /\|?[ \t]*(?:\n|\z)/ )
    end

    def end_of_table_cell?
        return true if @scanner.eos?
        return true if @scanner.check( /\n/ )
        return true if @scanner.check( /\|\s*\n/ )
        @scanner.scan( /\|/ )
    end

    # Now some random helper methods for decoding

    # True if the list marker matched last starts with "#".
    def list_ordered?
        @scanner.matched[0,1] == '#'
    end

    # Number of marker characters in the list marker matched last.
    def list_depth
        @scanner[1].size
    end

    # Returns a copy of +str+ with &, ", < and > replaced by html entities.
    def htmlesc( str )
        str.gsub( '&', '&amp;' ).gsub( '"', '&quot;' ).gsub( '<', '&lt;' ).gsub( '>', '&gt;' )
    end

    # Makes a value safe inside a double-quoted html attribute.
    def attribute_escape( value )
        value.to_s.gsub( '"', '&quot;' )
    end

    # Now the low level matching functions

    def add_to_html( object )
        @html << object.to_s
    end

    def html
        @html
    end

    # Fresh state for a conversion, so to_html can be called repeatedly.
    def reset
        @html = ''.dup
        @scanner = StringScanner.new( @text )
        @aliases = {}
        @indent = 0
        @open_tags = []
    end
end