One Hat Cyber Team

View File Name : deHTML.pm

package Razor2::Preproc::deHTML;

sub new {

    my $class = shift;

    my %html_tags = (
        "lt"     => '<',      "gt"     => '>',      "amp"    => '&',
        "quot"   => '"',      "nbsp"   => ' ',      "iexcl"  => chr(161),
        "cent"   => chr(162), "pound"  => chr(163), "curren" => chr(164),
        "yen"    => chr(165), "brvbar" => chr(166), "sect"   => chr(167),
        "uml"    => chr(168), "copy"   => chr(169), "ordf"   => chr(170),
        "laquo"  => chr(171), "not"    => chr(172), "shy"    => chr(173),
        "reg"    => chr(174), "macr"   => chr(175), "deg"    => chr(176),
        "plusmn" => chr(177), "sup2"   => chr(178), "sup3"   => chr(179),
        "acute"  => chr(180), "micro"  => chr(181), "para"   => chr(182),
        "middot" => chr(183), "cedil"  => chr(184), "sup1"   => chr(185),
        "ordm"   => chr(186), "raquo"  => chr(187), "frac14" => chr(188),
        "frac12" => chr(189), "frac34" => chr(190), "iquest" => chr(191),
        "Agrave" => chr(192), "Aacute" => chr(193), "Acirc"  => chr(194),
        "Atilde" => chr(195), "Auml"   => chr(196), "Aring"  => chr(197),
        "AElig"  => chr(198), "Ccedil" => chr(199), "Egrave" => chr(200),
        "Eacute" => chr(201), "Ecirc"  => chr(202), "Euml"   => chr(203),
        "Igrave" => chr(204), "Iacute" => chr(205), "Icirc"  => chr(206),
        "Iuml"   => chr(207), "ETH"    => chr(208), "Ntilde" => chr(209),
        "Ograve" => chr(210), "Oacute" => chr(211), "Ocirc"  => chr(212),
        "Otilde" => chr(213), "Ouml"   => chr(214), "times"  => chr(215),
        "Oslash" => chr(216), "Ugrave" => chr(217), "Uacute" => chr(218),
        "Ucirc"  => chr(219), "Uuml"   => chr(220), "Yacute" => chr(221),
        "THORN"  => chr(222), "szlig"  => chr(223), "agrave" => chr(224),
        "aacute" => chr(225), "acirc"  => chr(226), "atilde" => chr(227),
        "auml"   => chr(228), "aring"  => chr(229), "aelig"  => chr(230),
        "ccedil" => chr(231), "egrave" => chr(232), "eacute" => chr(233),
        "ecirc"  => chr(234), "euml"   => chr(235), "igrave" => chr(236),
        "iacute" => chr(237), "icirc"  => chr(238), "iuml"   => chr(239),
        "eth"    => chr(240), "ntilde" => chr(241), "ograve" => chr(242),
        "oacute" => chr(243), "ocirc"  => chr(244), "otilde" => chr(245),
        "ouml"   => chr(246), "divide" => chr(247), "oslash" => chr(248),
        "ugrave" => chr(249), "uacute" => chr(250), "ucirc"  => chr(251),
        "uuml"   => chr(252), "yacute" => chr(253), "thorn"  => chr(254),
        "yuml"   => chr(255)
    );

    return bless {
        html_tags => \%html_tags,
    }, $class;

}

sub isit {

    my ( $self, $text ) = @_;
    my $isit = 0;
    my ( $hdr, $body ) = split /\n\r*\n/, $$text, 2;

    return 0 unless $body;

    $isit = $body =~ /(?:<HTML>|<BODY|<FONT|<A HREF)/ism;
    return $isit if $isit;

    $isit = $hdr =~ m"^Content-Type: text/html"ism;
    return $isit;

}

# NOTE: Tag designations _are_ case sensitive.
# Returns: (length of tag detected, char)
# So the caller can basically do ``$i += $length_detected'', above.
#
# The big difference between this function and its C equivalent is
# that we're not modifying the char array that was passed in, but
# rather instead advising the caller on how much to skip/eat.

sub html_xlat_old {

    my ( $self, $chars, $i ) = @_;
    my ( $tag, $val );
    my ($r_val);
    my $r_tag = "";

    return 0 if ( $$chars[$i] !~ /[a-zA-Z]/ );

    # first figure out which is shorter, $i->EOS, or $i+10, and use
    # that to build a compare string for use with substr.  Otherwise,
    # requesting ``lengths'' from char arrays that exceeds the actual
    # array produce additional 'undef's.  10 is an arbitrary #, but at
    # least greater than the max length (+ 1) of any html &tag.

    my ($offset) = ( scalar @{$chars} - $i > 10 ? 10 : scalar @{$chars} );
    my ($s) = join( '', @{$chars}[ $i .. $i + $offset ] );

    while ( ( $tag, $val ) = each %{ $self->{html_tags} } ) {

        if ( substr( $s, 0, length($tag) ) eq $tag ) {
            ( $r_tag, $r_val ) = ( $tag => $val );
            $r_tag .= ';'    # so the length($r_tag) consumes the ``;''
              if ( substr( $s, length($tag), 1 ) eq ';' );
        }
    }

    return ( length($r_tag), $r_val );

}

sub html_xlat {
    my ( $self, $chars, $i ) = @_;

    #print "html_xlat($r_tag) start\n";
    return 0 if ( $$chars[$i] !~ /[a-zA-Z]/ );

    my $r_tag;

    # we used to walk till we got a ';', but to be compatible
    # with c, we won't check for ';'
    while ( $$chars[$i] && $$chars[$i] =~ /[a-zA-Z]/ ) {
        $r_tag .= $$chars[ $i++ ];
    }
    my $len = length($r_tag);    # do not include ;
    $len++ if ( $$chars[$i] eq ';' );

    my $val = $self->{html_tags}->{$r_tag};

    #print "html_xlat($r_tag) = ($len,$val)\n";

    return 0 unless $val;        # not found
    return ( $len, $val );
}

sub doit {

    my ( $self, $text ) = @_;
    my ( $hdr, $body ) = split /\n\r*\n/, $$text, 2;

    my (@chars) = split //, $body;
    my ($len) = scalar @chars;

    my ( $last, $quote, $sgml, $tag ) = ( "", "", "", "" );
    my (@out);
    my ($i) = 0;

    while ( $i < $len ) {
        my ($c) = $chars[ $i++ ];

        if ( $c eq $quote ) {

            if ( $c eq '-' && $last ne '-' ) {
                $last = $c;
                next;
            }
            else {
                $last = 0;
            }

            $quote = "";

        }
        elsif ( !$quote ) {

            if ( $c eq '<' ) {

                $tag = 1;
                if ( $chars[ $i++ ] eq '!' ) {
                    my ($s) = join( '', @chars[ $i .. $i + 10 ] );
                    $sgml = 1;
                }

            }
            elsif ( $c eq '>' ) {

                if ($tag) {

                    $sgml = 0;
                    $tag  = 0;
                }

            }
            elsif ( $c eq '-' ) {

                if ( $sgml and $last eq '-' ) {

                    $quote = '-';
                }
                else {
                    push @out, $c if ( !$tag );
                }

            }
            elsif ( $c eq "\"" or $c eq "\'" ) {

                if ($tag) {
                    $quote = $c;
                }
                else {
                    push @out, $c if ( !$tag );
                }

            }
            elsif ( $c eq "&" ) {

                my ( $len, $char ) = $self->html_xlat( \@chars, $i );
                if ($len) {
                    push @out, $char;
                    $i += $len;
                }
                else {
                    push @out, $c;
                }

            }
            else {

                push @out, $c if ( !$tag );

            }

        }

        $last = $c;
    }

    $$text = "$hdr\n\n" . join( '', @out );

}

1;