#
# $Id$
#
use Text::Wrap;
use Data::Dumper;
$Text::Wrap::columns = 76;
$Text::Wrap::unexpand = 0;
$MAGIC_NBSP = "\xff";
sub unentity
{
my $html = shift;
die ( "INTERNAL ERROR: magic nbsp code found in html" ) if ( $html =~ /$MAGIC_NBSP/ );
$html =~ s/ /$MAGIC_NBSP/g;
$html =~ s/<//g;
$html =~ s/“/<>/g;
$html =~ s/©/(c)/g;
$html =~ s/&/&/g;
return $html;
};
sub linify
{
my $text = shift;
$text =~ s/\n/ /g;
$text =~ s/\t/ /g;
$text =~ s/\s{2,}/ /g;
$text =~ s/^\s+//ms;
$text =~ s/\s+$//ms;
$text =~ s/\s+(\w)\s+/ \1$MAGIC_NBSP/g; # glue 1-letter words with nbsp's
return $text;
};
sub in
{
my ($a,$v) = @_;
foreach my $e ( @$a )
{
return 1 if $e eq $v;
}
return 0;
};
###############
## entry point
###############
undef $/;
$html = <>;
$html =~ s/.*?
//ms;
$html =~ s/<\/body.*/\n--eof--\n/ms;
$res = "";
$acc = "";
$hdr = "";
$left = 0;
$red = "";
@lists = "";
$li = 0;
$pre = 0;
while ( $html =~ s/^(.*?)<([\/]*\w+)(\s+.*?)*>//ms )
{
$text = $1;
$tag = lc $2;
$attrs = lc $3;
$acc .= $text;
next if ( in ( [ "a","/a","i","/i","p","code","/code","b","/b","span","/span",
"strong","/strong","br","em","/em" ], $tag ) );
if ( in ( [ "div","/div","/h1","/h2","/h3","/h4","/h5","/p","hr","h5",
"dl","ol","ul","dt","dd","/dl","/ol","/ul","/dt","/dd",
"li","/li","pre","/pre" ], $tag ) )
{
# fixme! handle userinput instead of literallayout
if ( !$pre )
{
$acc = linify(unentity($acc));
$acc = wrap ( (" " x $left).$red, " "x($left+length($red)), $acc );
die if ( $acc =~ /\t/ );
} else
{
my @lines = split ( /\n/, unentity($acc) );
$acc = "";
foreach $line ( @lines )
{
next if ( !$acc && !$line );
$acc .= ( " " x $left ) . "| $line\n";
}
$acc = "\n$acc\n";
}
$red = " "x length($red);
if ( $acc )
{
if ( length($hdr) )
{
$res .= "$acc\n" . $hdr x length($acc) . "\n\n";
$hdr = "";
} else
{
$res .= "$acc\n";
}
}
$acc = "";
$res .= "\n" if ( in ( [ "div","/div","/h1","/h2","/h3","/p","hr" ], $tag ) );
$res .= "\n" if ( $#lists<=2 && in ( ["/dl","/ol","/ul" ], $tag ) );
if ( $tag eq "hr" )
{
$res .= "-" x $Text::Wrap::columns . "\n\n";
}
if ( in ( [ "dd","ol","ul" ], $tag ) )
{
$left += 3;
$li = 1 if ( $tag eq "ol" ); # fixme! allow nested ol
push ( @lists, $tag );
next;
}
if ( in ( [ "/dd","/ol","/ul" ], $tag ) )
{
$left -= 3;
pop @lists;
$red = ""; # just in case
$res .= "\n";
next;
}
if ( $tag eq "li" )
{
my $list = $lists[$#lists];
if ( $list eq "ol" ) { $red = "$li. "; $li++; }
elsif ( $list eq "ul" ) { $red = "* "; }
else { print Dumper(@lists);
die ( "INTERNAL ERROR: 'li' in unknown list '$list'" ); }
next;
}
if ( $tag eq "pre" || ( $tag eq "div" && $attrs =~ /literallayout/ ) )
{
$pre = 1;
$left += 3;
next;
}
if ( $pre && ( $tag eq "/pre" || $tag eq "/div" ) )
{
$pre = 0;
$left -= 3;
next;
}
next;
}
if ( $tag eq "h1" ) { $hdr = "="; next; }
if ( $tag eq "h2" ) { $hdr = "="; next; }
if ( $tag eq "h3" ) { $hdr = "-"; next; }
if ( $tag eq "h4" ) { $hdr = ""; next; }
die ( "unknown tag='$tag' attrs='$attrs'\n" );
}
$res .= "\n\n--eof--\n";
$res =~ s/^\n+//;
$res =~ s/\n{3,}/\n\n/gms;
$res =~ s/$MAGIC_NBSP/ /g;
print $res;
#
# $Id$
#