#!/usr/local/bin/perl -T
#
# textweb.pl -- Minimal text-to-html markup processor.
#
# This program has been donated to the Public Domain by Thatcher
# Ulrich, http://tulrich.com . Do whatever you want with it.
#
# The line below starts a block comment.
my $COMMENT = < \n";
print "\n";
exit(1);
}
#
# Process the page's contents.
#
$page_contents{'body'} = '';
$page_contents{'title'} = "textweb";
$page_contents{'head'} = '';
$page_contents{'bodyheader'} = '';
$page_contents{'requested_format'} = "html";
if ($q->param('format') && $q->param('format') eq "atom") {
$page_contents{'requested_format'} = "atom";
}
$page_contents{'format'} = $page_contents{'requested_format'};
$page_contents{'entries'} = [];
$page_contents{'selflink_atom'} = $q->url(-base=>1) . "/" . reextend($path, ".xml");
$page_contents{'selflink_html'} = $q->url(-base=>1) . "/" . reextend($path, ".html");
process_page(\%page_contents, $path, $base);
#
# Output the content.
#
if ($page_contents{'format'} eq "atom") {
# ATOM feed output.
my $entries = $page_contents{'entries'};
print "Content-type: application/atom+xml\n\n";
# print "Content-type: text/html\n\n";
print "\n";
print "
* Numbered lists get turned into HTML
(@@ not yet!)
* Line-break separated paragraphs get denoted in HTML
* URL's get turned into links.
* Any line that starts with exactly four dashes "----" toggles plain
text mode. All text through the next line starting in four dashes
is rendered as plain text. Use this to quote code, for example.
* Any line that start with a single space followed by four dashes also
toggles plain text mode, but does not output the current line. Use
this to "silently" exit or enter plain text mode.
* Any line that consists of nothing but five or more dashes ("-----")
gets turned into a horizontal bar.
* Things that look like section headings (underlined with '='
characters) get rendered as big bold headings (
).
* Things that look like minor section headings (underlined with '-'
characters) get rendered as smaller bold headings (
).
* Special directives:
* \include{filename.txt} -- include the specified file (and apply
processing)
* \title{some text} -- set the page title
* \head{some html} -- insert some stuff into the page's tag (e.g. CSS styles)
* \code{some literal text}
* \comment{some comment text that gets filtered out}
* \atom_author{My Name} -- use this to declare the file as a blog, and set the author
for exporting an Atom feed.
* \\text... -- don't process this text.
* Blog instructions: write your blog like this:
[.... misc header formatting ...]
\atom_author{Your Name Here}
------------------------------------------------
15 Jan 2002
*I'm Feeling Funny*
I ate something last night, and now I feel funny -- was it the raw
clams, or the fried caterpillars?
------------------------------------------------
And so on -- you separate your posts with horizontal lines. To get an Atom feed of your
blog, link to "http://yoursite.org/yourpage.xml"
TODO:
* Clean it up; it's a mess
* Better ability to protect text from markup (e.g. code snippets),
especially inline.
* TWiki-style [[url][description]] tags are handy, but don't look good
or intuitive in the source .txt. Come up with something better.
* Idea: put hidden URLs in parentheses. The link text is the
previous word. Could also insert an explicit [link] word or
something.
* Make different weight headings by looking at whether '===' or '---'
is used to underline the heading text.
* Something clever for tables? Would be nice to recognize fixed-pitch
ASCII-art tables and turn 'em into HTML.
* Ability to pull source document from any URL. Use this to view (and
test) random .txt documents on the web.
Send comments, flames, kudos and bug fixes to me, tu\@tulrich.com .
END_OF_COMMENT
use strict;
use warnings;
use CGI;
use Cwd;
# Helper; re-extend a filename with a new given extension.
sub reextend {
my ($fn, $new_ext) = @_;
if ($fn =~ /^(.*)\.txt$/) {
return $1 . $new_ext;
}
return $fn;
}
# Helper; escape html so it's suitable for embedding in Atom text fields.
sub escape_html {
my ($text) = @_;
# Escape all '&' chars
$text =~ s/\&/\&/g;
# Escape all '<' chars
$text =~ s/\\</g;
return $text;
}
# Helper; decode a text month to a number.
sub decode_month {
my ($in) = @_;
if ($in =~ /^jan/i) { return 1; }
elsif ($in =~ /^feb/i) { return 2; }
elsif ($in =~ /^mar/i) { return 3; }
elsif ($in =~ /^apr/i) { return 4; }
elsif ($in =~ /^may/i) { return 5; }
elsif ($in =~ /^jun/i) { return 6; }
elsif ($in =~ /^jul/i) { return 7; }
elsif ($in =~ /^aug/i) { return 8; }
elsif ($in =~ /^sep/i) { return 9; }
elsif ($in =~ /^oct/i) { return 10; }
elsif ($in =~ /^nov/i) { return 11; }
elsif ($in =~ /^dec/i) { return 12; }
return 0;
}
sub encode_date {
# Args should be numeric.
my ($year, $month, $day) = @_;
if ($year == 0 || $month == 0 || $day == 0) {
# Not a valid date.
return '';
}
if ($month > 12) { return ''; }
if ($day > 31) { return ''; }
if ($year < 100) {
$year += 2000; # 2 digit year!
}
return sprintf("%04d-%02d-%02dT00:00:00Z", $year, $month, $day);
}
sub get_google_friendconnect_comment_div {
my ($entry_id) = @_;
my $text = <
Not Found
\n";
print "The requested URL $path was not found on this server. block.
$outbuf .= "
$arg
";
}
elsif ($directive eq 'nomarkup') {
# Pass the text straight through; no mark up or processing.
$outbuf .= $arg;
}
elsif ($directive eq 'comment') {
# Don't generate any output.
}
elsif ($directive eq 'bodyheader') {
# Add the contents to the tag.
$contents->{'bodyheader'} .= $arg . " ";
}
elsif ($directive eq 'atom_author') {
$contents->{'atom_author'} = $arg;
# Allow entry extraction, if request wants that.
if ($contents->{'requested_format'} eq "atom") {
&$start_feed();
}
}
elsif ($directive eq 'selflink_html') {
$contents->{'selflink_html'} = $arg;
}
elsif ($directive eq 'selflink_atom') {
$contents->{'selflink_atom'} = $arg;
}
elsif ($directive eq 'markup') {
if ($arg eq 'off') {
$markup_on = 0;
} else {
$markup_on = 1;
}
}
elsif ($directive eq 'allow_comments') {
$allow_comments = 1;
}
} elsif ($markup_on == 0) {
# Markup is disabled; emit input lines as is.
$outbuf .= $line;
} elsif ($line =~ /^\\\\(.*)$/) {
# Starts with two backslashes -- emit the rest of the line literally.
$outbuf .= $1 . "\n";
} elsif ($para == 0 && $line =~ /^(-----)|(_____)/) {
# Horizontal line -- ends & begins feed entries.
if ($is_feed) {
# Emit the current entry, if any.
if ($outbuf =~ /[^\w]/ && $last_date ne '') {
my $new_entry = {
'content' => escape_html($outbuf),
'updated' => $last_date,
'title' => $last_title,
};
push @{ $contents->{'entries'} }, $new_entry;
&$start_entry();
}
$outbuf = '';
} else {
if ($entry_id) {
&$emit_entry_footer($entry_id);
}
$last_date = '';
$entry_id = '';
$outbuf .= "
\n";
}
}
elsif ($para == 0 && $line =~ /^----([^-]+|$)/) {
# Enter plain text mode.
$plain_text = 1;
$outbuf .= "\n" . process_plain_text($line);
}
elsif ($para == 0 && $line =~ /^ ----([^-]+|$)/) {
# *Silently* enter plain text mode.
$plain_text = 1;
$outbuf .= "
\n";
}
elsif ($line =~ /^\s*$/) {
# Blank line.
if ($para == 1) {
# End the paragraph.
$outbuf .= process_paragraph_text($para_text);
$para_text = '';
$outbuf .= "
"; $para_text .= $2; $bullet_item = 1; $para = 1; } else { # Line with text. # If we're making a feed, see if this is a date or title line. my $got_date = &$process_date_line($line); if ($is_feed && $got_date) { # Don't emit the date as part of the feed text, keep it separate. } elsif ($is_feed && &$process_title_line($line)) { # got a title } else { if ($got_date && ! $is_feed) { $entry_id = "d$last_date"; &$emit_entry_header($entry_id); } if ($bullet_item == 1) { if ($line =~ /^ .*/) { # line starts with at least a couple spaces; looks # like a new paragraph within the existing bullet. } else { # Terminate leftover bullet item. $outbuf .= process_paragraph_text($para_text); $para_text = ''; $outbuf .= "
\n";
$para = 1;
# Start the paragraph.
$para_text .= $line;
}
} else {
# We're in a paragraph, append the text.
$para_text .= $line;
}
}
}
}
# Flush any leftover text.
$outbuf .= process_paragraph_text($para_text);
$para_text = '';
if ($contents->{'requested_format'} eq "html") {
$contents->{'body'} .= $outbuf;
}
close $in;
}
sub process_plain_text
# Takes a string; returns a string. Escape HTML special characters.
{
my ($text) = @_;
$text =~ s/\&/\&/g;
$text =~ s/\>/\>/g;
$text =~ s/\\</g;
# Anything else? Should we escape quotes?
return $text;
}
sub process_paragraph_text
# Takes a string; returns a string. Does markup substitutions to make
# plain text markup look more like HTML.
{
my ($text) = @_;
#
# Do text substitutions. These are digusting -- TODO clean them up.
#
my $pre_text = '';
$text .= ' '; # Put space at end, makes a few regexps simpler...
# Process [[http_link][description]]
$text =~ s/
\[\[
([^\]]+)
\]\[
([^\]]+)
\]\]
/\$2\<\/a\>/gsx;
# Turn bare http:// references into active links.
while ($text =~ s/
(^|[^\"])
(http:\/\/(\.?[\w\d\~\%\?\&\/\-\=\_]+)+)
(\.?([\s\)\,\n]|(\<\/p)))
/$1$2<\/a>$4/x)
{}
# Escape "free" '>' chars
while ($text =~ s/\s\>\s/ \> /) {}
# Escape "free" '<' chars
while ($text =~ s/\s\<\s/ \< /) {}
# Escape "free" '&' chars
while ($text =~ s/\s\&\s/ \& /) {}
# Fancy double quotes: LaTeX style replacement of `` with left quote
while ($text =~ s/\`\`/\“/) {}
# Fancy double quotes: LaTeX style replacement of '' with right quote
while ($text =~ s/\'\'/\”/) {}
# ’ == right single quote
# ‘ == left single quote
# Translate *bold text* into HTML
while ($text =~ s/
\*([\w\d]([\w\d\s\,\-\.\'\"\:\&\;\/]*[\w\d\!\?]))\*
([\s\.\,\)])
/\$1$3\<\/b\>/x)
{}
# Translate /italic text/ into HTML
while ($text =~ s/
(^|[^\:\<\>])
\/
([\w\d\(\[\"\']
[^\/]+)
\/
([\s\)\]]|([\.\!\?]\s)|([\"\'][^\<\>]))
/$1\$2\<\/i\>$3/x)
{}
# Do combinations of bold/underline/italic? Like _*blah blah*_
# Find things that look like:
#
# Some Minor Heading
# ------------------
# And turn them into sub-headings
while ($text =~ s/(^|\n)(\w.+)\n\-\-+\n/\n$1$2<\/b><\/h3>\n/) {}
# Find things that look like:
#
# Some Heading
# ============
#
# And turn them into section headings.
while ($text =~ s/(^|\n)(\w.+)\n\=\=+\n/\n
$1$2<\/b><\/h2>\n/) {}
# Find -----'s and turn them into horizontal lines.
while ($text =~ s/(^|\n)------*\s*(\n|$)/$1
$2/) {}
# Find _____'s and turn them into horizontal lines.
while ($text =~ s/(^|\n)______*\s*(\n|$)/$1
$2/) {}
# Translate _underlined text_ into HTML underlined text
while ($text =~ s/
_([a-z\d]([a-z\d\s\-\,\:]*[a-z\d]))_
([\s\.\,\:])
/\$1\<\/u\>$3/sxi)
{
}
return $pre_text . $text;
}
#
# Somewhat mysterious magic to open a local filehandle. See Camel
# book.
#
sub newopen {
my $path = shift;
local *FH; # not my!
open (FH, $path) || return undef;
return *FH;
}