#!/usr/local/bin/perl -T # # textweb.pl -- Minimal text-to-html markup processor. # # This program has been donated to the Public Domain by Thatcher # Ulrich, http://tulrich.com . Do whatever you want with it. # # The line below starts a block comment. my $COMMENT = <' by itself (not adjacent to any words or slashes), then Textweb will assume you meant it to show up in the document, and will escape it. * Do *bold* _underline_ /italic/ like you would in text email. * `` and '' get turned into fancy quote marks. * Bulleted lists (like this one) get turned into HTML unordered lists * Numbered lists get turned into HTML
    (@@ not yet!) * Line-break separated paragraphs get denoted in HTML * URL's get turned into links. * Any line that starts with exactly four dashes "----" toggles plain text mode. All text through the next line starting in four dashes is rendered as plain text. Use this to quote code, for example. * Any line that start with a single space followed by four dashes also toggles plain text mode, but does not output the current line. Use this to "silently" exit or enter plain text mode. * Any line that consists of nothing but five or more dashes ("-----") gets turned into a horizontal bar. * Things that look like section headings (underlined with '=' characters) get rendered as big bold headings (

    ). * Things that look like minor section headings (underlined with '-' characters) get rendered as smaller bold headings (

    ). * Special directives: * \include{filename.txt} -- include the specified file (and apply processing) * \title{some text} -- set the page title * \head{some html} -- insert some stuff into the page's tag (e.g. CSS styles) * \code{some literal text} * \comment{some comment text that gets filtered out} * \atom_author{My Name} -- use this to declare the file as a blog, and set the author for exporting an Atom feed. * \\text... -- don't process this text. * Blog instructions: write your blog like this: [.... misc header formatting ...] \atom_author{Your Name Here} ------------------------------------------------ 15 Jan 2002 *I'm Feeling Funny* I ate something last night, and now I feel funny -- was it the raw clams, or the fried caterpillars? ------------------------------------------------ And so on -- you separate your posts with horizontal lines. To get an Atom feed of your blog, link to "http://yoursite.org/yourpage.xml" TODO: * Clean it up; it's a mess * Better ability to protect text from markup (e.g. code snippets), especially inline. * TWiki-style [[url][description]] tags are handy, but don't look good or intuitive in the source .txt. Come up with something better. * Idea: put hidden URLs in parentheses. The link text is the previous word. Could also insert an explicit [link] word or something. * Make different weight headings by looking at whether '===' or '---' is used to underline the heading text. * Something clever for tables? Would be nice to recognize fixed-pitch ASCII-art tables and turn 'em into HTML. * Ability to pull source document from any URL. Use this to view (and test) random .txt documents on the web. Send comments, flames, kudos and bug fixes to me, tu\@tulrich.com . END_OF_COMMENT use strict; use warnings; use CGI; use Cwd; # Helper; re-extend a filename with a new given extension. sub reextend { my ($fn, $new_ext) = @_; if ($fn =~ /^(.*)\.txt$/) { return $1 . $new_ext; } return $fn; } # Helper; escape html so it's suitable for embedding in Atom text fields. sub escape_html { my ($text) = @_; # Escape all '&' chars $text =~ s/\&/\&/g; # Escape all '<' chars $text =~ s/\ 12) { return ''; } if ($day > 31) { return ''; } if ($year < 100) { $year += 2000; # 2 digit year! } return sprintf("%04d-%02d-%02dT00:00:00Z", $year, $month, $day); } sub get_google_friendconnect_comment_div { my ($entry_id) = @_; my $text = <
    END_OF_TEMPLATE ; return $text; } my $q = new CGI; if ($q->param('show_source')) { # # Show script source. # print "Content-type: text/plain\n\n"; open IN, "textweb.pl"; print ; close IN; exit(0); } my $path = $q->param('path') || 'index.txt'; # Don't allow ".."'s in the path. while ($path =~ s/\.\.//) {} # Don't allow leading slash. while ($path =~ s/^\///) {} my $base = getcwd(); my $file = "$base/$path"; my %page_contents; # # Only serve .txt files, and make sure the page exists. # if ((! $file =~ /\.txt$/) || (! -f $file)) { print "Content-type: text/html\n\n"; print "404 Not Found\n"; print "

    Not Found

    \n"; print "The requested URL $path was not found on this server.

    \n"; print "\n"; exit(1); } # # Process the page's contents. # $page_contents{'body'} = ''; $page_contents{'title'} = "textweb"; $page_contents{'head'} = ''; $page_contents{'bodyheader'} = ''; $page_contents{'requested_format'} = "html"; if ($q->param('format') && $q->param('format') eq "atom") { $page_contents{'requested_format'} = "atom"; } $page_contents{'format'} = $page_contents{'requested_format'}; $page_contents{'entries'} = []; $page_contents{'selflink_atom'} = $q->url(-base=>1) . "/" . reextend($path, ".xml"); $page_contents{'selflink_html'} = $q->url(-base=>1) . "/" . reextend($path, ".html"); process_page(\%page_contents, $path, $base); # # Output the content. # if ($page_contents{'format'} eq "atom") { # ATOM feed output. my $entries = $page_contents{'entries'}; print "Content-type: application/atom+xml\n\n"; # print "Content-type: text/html\n\n"; print "\n"; print "\n"; print " " . $page_contents{'title'} . "\n"; print " " . $page_contents{'atom_author'} . "\n"; print " " . $page_contents{'selflink_atom'} . "\n"; print " " . $$entries[0]->{'updated'} . "\n"; print " \n"; print " \n"; for my $entry (@{$entries}) { print " \n"; my $this_id = $page_contents{'selflink_atom'} . "#d" . $entry->{'updated'}; my $this_permalink = $page_contents{'selflink_html'} . "#d" . $entry->{'updated'}; print " " . $entry->{'title'} . "\n"; print " " . $this_id . "\n"; print " \n"; print " " . $entry->{'updated'} . "\n"; print " \n"; print $entry->{'content'}; print " \n"; print " \n"; } # iterate over the entries... # # ... # # urn:junk.... # 2003-12-13T18:30:02Z # Some text. # print "\n"; } else { # Normal HTML output. print "Content-type: text/html\n\n"; print "" . $page_contents{'title'} . "\n"; print $page_contents{'head'}; print "\n"; print ""; print $page_contents{'body'}; print "\n"; } exit(0); # # End of main script # sub process_page # # Given a parser context, a .txt filename, and the current directory, # read the file and generate HTML. Put the HTML output in # $context->{'body'}. # # If $context->{'requested_format'} is "atom" and the page contains # blog-like entries, then put a reference to a list of entries in # $contenxt->{'entries'}. Each entry is a hash, with keys "title", # "date", and "content". # # Entries begin and end with horizontal rules. The title & date are # inferred from lines at the beginning of an entry that look like # title or date. # { my ($contents, $file, $dir) = @_; # Parser state variables. my $para = 0; # 0 --> not in a paragraph, 1 --> in a paragraph my $para_text = ''; # current paragraph my $bullet_list = 0; my $bullet_item = 0; my $plain_text = 0; my $markup_on = 1; my $in_entry = 0; # 0 --> not in an entry, 1 --> in an entry my $entry_id = ''; my $allow_comments = 0; my $outbuf = ''; my $is_feed = 0; my $last_title = ''; my $last_date = ''; my $in = newopen($file) or die "can't open $file $!"; my $line; my $start_feed = sub { $is_feed = 1; $contents->{'format'} = "atom"; $contents->{'atom_id'} = ""; # URL + file? if (!exists($contents->{'atom_author'})) { $contents->{'atom_author'} = ''; } }; my $start_entry = sub { $last_date = ''; $last_title = ''; $outbuf = ''; }; my $process_date_line = sub { my ($line) = @_; # Try "13 Jan 2005" format. if ($line =~ /^(\d+)(-|\s)([\w]{3,})(-|\s)(\d{2,4})$/) { my $day = $1 + 0; my $year = $5 + 0; my $month = decode_month($3); my $date = encode_date($year, $month, $day); if ($date ne '') { # Date looked valid. $last_date = $date; return 1; } } # Add other formats here... return 0; }; my $process_title_line = sub { my ($line) = @_; if ($last_title eq '') { # First non-date text line is assumed to be the post title. $last_title = escape_html(process_paragraph_text($line)); return 1; } return 0; }; my $emit_entry_header = sub { my ($entry_id) = @_; # In HTML, we produce a permalink anchor from the date: $outbuf .= "\n"; $outbuf .= ""; my $permalink = $contents->{'selflink_html'} . "#" . $entry_id; $outbuf .= ""; }; my $emit_entry_footer = sub { my ($entry_id) = @_; my $comment_id = $contents->{'selflink_atom'} . "#$entry_id"; if ($allow_comments == 1) { $outbuf .= "\n"; $outbuf .= get_google_friendconnect_comment_div($comment_id); $outbuf .= "\n"; } }; while ($line = <$in>) { if ($plain_text == 1) { # # Plain text mode. # if ($line =~ /^----([^-]+|$)/) { # Exit plain text mode. $outbuf .= process_plain_text($line) . "\n"; $plain_text = 0; } elsif ($line =~ /^ ----([^-]+|$)/) { # *Silently* exit plain text mode. $outbuf .= "\n"; $plain_text = 0; } else { $outbuf .= process_plain_text($line); } } elsif ($line =~ /\\([a-z_]+)\{(.*)\}/) { # # Handle \directive{arg} # my $directive = $1; my $arg = $2; if ($directive eq 'title') { $contents->{'title'} = $arg; } elsif ($directive eq 'head') { # Insert some junk into the tag. $contents->{'head'} .= $arg . "\n"; } elsif ($directive eq 'include') { # Process include file. $contents->{'body'} .= $outbuf; $outbuf = ''; process_page($contents, "$dir/$arg", $dir); } elsif ($directive eq 'code') { # Wrap argument in block. $outbuf .= "$arg"; } elsif ($directive eq 'nomarkup') { # Pass the text straight through; no mark up or processing. $outbuf .= $arg; } elsif ($directive eq 'comment') { # Don't generate any output. } elsif ($directive eq 'bodyheader') { # Add the contents to the tag. $contents->{'bodyheader'} .= $arg . " "; } elsif ($directive eq 'atom_author') { $contents->{'atom_author'} = $arg; # Allow entry extraction, if request wants that. if ($contents->{'requested_format'} eq "atom") { &$start_feed(); } } elsif ($directive eq 'selflink_html') { $contents->{'selflink_html'} = $arg; } elsif ($directive eq 'selflink_atom') { $contents->{'selflink_atom'} = $arg; } elsif ($directive eq 'markup') { if ($arg eq 'off') { $markup_on = 0; } else { $markup_on = 1; } } elsif ($directive eq 'allow_comments') { $allow_comments = 1; } } elsif ($markup_on == 0) { # Markup is disabled; emit input lines as is. $outbuf .= $line; } elsif ($line =~ /^\\\\(.*)$/) { # Starts with two backslashes -- emit the rest of the line literally. $outbuf .= $1 . "\n"; } elsif ($para == 0 && $line =~ /^(-----)|(_____)/) { # Horizontal line -- ends & begins feed entries. if ($is_feed) { # Emit the current entry, if any. if ($outbuf =~ /[^\w]/ && $last_date ne '') { my $new_entry = { 'content' => escape_html($outbuf), 'updated' => $last_date, 'title' => $last_title, }; push @{ $contents->{'entries'} }, $new_entry; &$start_entry(); } $outbuf = ''; } else { if ($entry_id) { &$emit_entry_footer($entry_id); } $last_date = ''; $entry_id = ''; $outbuf .= "


    \n"; } } elsif ($para == 0 && $line =~ /^----([^-]+|$)/) { # Enter plain text mode. $plain_text = 1; $outbuf .= "
    \n" . process_plain_text($line);
            }
            elsif ($para == 0 && $line =~ /^ ----([^-]+|$)/) {
                # *Silently* enter plain text mode.
                $plain_text = 1;
                $outbuf .= "
    \n";
            }
            elsif ($line =~ /^\s*$/) {
                # Blank line.
    
                if ($para == 1) {
                    # End the paragraph.
                    $outbuf .= process_paragraph_text($para_text);
                    $para_text = '';
                    $outbuf .= "

    \n\n"; $para = 0; } } elsif ($line =~ /^( *)\* (.+)$/) { # Start of a bulleted item. $outbuf .= process_paragraph_text($para_text); $para_text = ''; if ($bullet_item == 1) { # Terminate leftover bullet item. $outbuf .= "

    \n"; $bullet_item = 0; } if ($bullet_list == 0) { # Start unordered list. $outbuf .= "
      \n"; $bullet_list = 1; } # Start bullet item. $outbuf .= "
    • "; $para_text .= $2; $bullet_item = 1; $para = 1; } else { # Line with text. # If we're making a feed, see if this is a date or title line. my $got_date = &$process_date_line($line); if ($is_feed && $got_date) { # Don't emit the date as part of the feed text, keep it separate. } elsif ($is_feed && &$process_title_line($line)) { # got a title } else { if ($got_date && ! $is_feed) { $entry_id = "d$last_date"; &$emit_entry_header($entry_id); } if ($bullet_item == 1) { if ($line =~ /^ .*/) { # line starts with at least a couple spaces; looks # like a new paragraph within the existing bullet. } else { # Terminate leftover bullet item. $outbuf .= process_paragraph_text($para_text); $para_text = ''; $outbuf .= "

    • \n"; $bullet_item = 0; } } if ($bullet_item == 0 && $bullet_list == 1) { # Terminate open bullet-list. $outbuf .= "
    \n"; $bullet_list = 0; } if ($para == 0) { if ($line =~ /^[\<]/) { # Looks like HTML. Pass it straight through. $outbuf .= $line; } else { # Start a paragraph. # one already. $outbuf .= process_paragraph_text($para_text); $para_text = ''; $outbuf .= "

    \n"; $para = 1; # Start the paragraph. $para_text .= $line; } } else { # We're in a paragraph, append the text. $para_text .= $line; } } } } # Flush any leftover text. $outbuf .= process_paragraph_text($para_text); $para_text = ''; if ($contents->{'requested_format'} eq "html") { $contents->{'body'} .= $outbuf; } close $in; } sub process_plain_text # Takes a string; returns a string. Escape HTML special characters. { my ($text) = @_; $text =~ s/\&/\&/g; $text =~ s/\>/\>/g; $text =~ s/\$2\<\/a\>/gsx; # Turn bare http:// references into active links. while ($text =~ s/ (^|[^\"]) (http:\/\/(\.?[\w\d\~\%\?\&\/\-\=\_]+)+) (\.?([\s\)\,\n]|(\<\/p))) /$1$2<\/a>$4/x) {} # Escape "free" '>' chars while ($text =~ s/\s\>\s/ \> /) {} # Escape "free" '<' chars while ($text =~ s/\s\<\s/ \< /) {} # Escape "free" '&' chars while ($text =~ s/\s\&\s/ \& /) {} # Fancy double quotes: LaTeX style replacement of `` with left quote while ($text =~ s/\`\`/\“/) {} # Fancy double quotes: LaTeX style replacement of '' with right quote while ($text =~ s/\'\'/\”/) {} # ’ == right single quote # ‘ == left single quote # Translate *bold text* into HTML while ($text =~ s/ \*([\w\d]([\w\d\s\,\-\.\'\"\:\&\;\/]*[\w\d\!\?]))\* ([\s\.\,\)]) /\$1$3\<\/b\>/x) {} # Translate /italic text/ into HTML while ($text =~ s/ (^|[^\:\<\>]) \/ ([\w\d\(\[\"\'] [^\/]+) \/ ([\s\)\]]|([\.\!\?]\s)|([\"\'][^\<\>])) /$1\$2\<\/i\>$3/x) {} # Do combinations of bold/underline/italic? Like _*blah blah*_ # Find things that look like: # # Some Minor Heading # ------------------ # And turn them into sub-headings while ($text =~ s/(^|\n)(\w.+)\n\-\-+\n/\n

    $1$2<\/b><\/h3>\n/) {} # Find things that look like: # # Some Heading # ============ # # And turn them into section headings. while ($text =~ s/(^|\n)(\w.+)\n\=\=+\n/\n

    $1$2<\/b><\/h2>\n/) {} # Find -----'s and turn them into horizontal lines. while ($text =~ s/(^|\n)------*\s*(\n|$)/$1
    $2/) {} # Find _____'s and turn them into horizontal lines. while ($text =~ s/(^|\n)______*\s*(\n|$)/$1
    $2/) {} # Translate _underlined text_ into HTML underlined text while ($text =~ s/ _([a-z\d]([a-z\d\s\-\,\:]*[a-z\d]))_ ([\s\.\,\:]) /\$1\<\/u\>$3/sxi) { } return $pre_text . $text; } # # Somewhat mysterious magic to open a local filehandle. See Camel # book. # sub newopen { my $path = shift; local *FH; # not my! open (FH, $path) || return undef; return *FH; }