package wlgmod::html;

# HTML word extract plugin

# Plugin initialisation hook: checks that the CPAN modules needed by this
# plugin can be loaded.
#
# Returns an empty string on success, or a human-readable error message
# naming the first module that could not be loaded.
sub init {
    # get_text() constructs an HTML::Parser object and this file does not
    # load that module anywhere else, so check for it explicitly.
    if (!eval { require HTML::Parser; 1 }) {
        return "Cannot find module HTML::Parser (http://www.cpan.org/modules/index.html)";
    }

    # A block eval avoids compiling a string at run time. import() is still
    # called so that decode_entities() is available unqualified in this
    # package, exactly as "use HTML::Entities" would arrange.
    if (!eval { require HTML::Entities; HTML::Entities->import(); 1 }) {
        return "Cannot find module HTML::Entities (http://www.cpan.org/modules/index.html)";
    }

    return "";    # ok
}

# Extracts the words contained in an HTML file.
#
# Arguments (called as a method):
#   $this     - invocant; not used by this routine
#   $filename - path of the HTML file to read
#
# Returns the list of non-empty, whitespace-separated words found in the
# text version of the file (as produced by get_text), with the punctuation
# characters , . ; : ? removed from each word.
#
# Dies with "Cannot open ..." if the file cannot be opened.
sub get_words {
    my $this = shift;
    my $filename = shift;
    my @words;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode/pipe specification, and no global bareword
    # handle is left behind for other code to collide with.
    open(my $fh, '<', $filename) || die "Cannot open $filename: $!";

    my $content;
    {
        # Slurp mode for this read only. local restores the caller's input
        # record separator on scope exit instead of changing it for the
        # rest of the program.
        local $/;
        $content = <$fh>;
    }
    close($fh);

    # A failed read yields undef; treat it as an empty document.
    $content = '' unless defined $content;

    my @tokens = split(/\s+/, get_text($content));
    foreach my $word (@tokens) {
        # Tokens contain no whitespace after the split, so only the
        # punctuation needs stripping.
        $word =~ s/[,.;:?]+//g;

        # Skip tokens that were empty or consisted only of punctuation.
        if ($word ne "") {
            push @words, $word;
        }
    }

    return (@words);
}

# get_text takes a single argument, the content of an HTML file, and
# returns the text version of it:
#   - HTML comments and the contents of <script>/<style> elements are dropped
#   - <BR> and <P> tags (opening, closing or self-closing) become newlines
#   - all other tags are removed
#   - carriage returns are removed
#   - HTML entities are decoded
# An undefined argument is treated as an empty document.
# Dies if HTML::Parser or HTML::Entities cannot be loaded.
sub get_text {
    my $html = shift;
    $html = '' unless defined $html;

    # Load the modules on demand so this routine does not depend on init()
    # having been called first, nor on some other module having loaded
    # HTML::Parser. require is a no-op when the module is already loaded.
    require HTML::Parser;
    require HTML::Entities;

    my $new_html = '';

    # Remove the HTML comments first (idiom from perldoc HTML::Parser):
    # the default handler re-emits the source text of every event, while
    # the empty comment handler makes comments produce nothing.
    my $parser = HTML::Parser->new(
        default_h => [ sub { $new_html .= shift }, 'text' ],
        comment_h => [""],
    );
    # Also, remove <script> and <style> stuff
    $parser->ignore_elements(qw(script style));
    $parser->parse($html);
    $parser->eof;

    $html = $new_html;

    $html =~ s/\r//g;

    # replace <BR> and <P> tags with \n (also, </BR>, </P>, <P/>, <BR/>)
    $html =~ s/\<(\/)?(BR|P)(\s*\/)?\>/\n/gi;
    # delete other HTML tags
    $html =~ s/\<[^>]+\>//g;

    # Replace HTML entities with normal text. This is done last so that
    # decoded "<" / ">" characters are not mistaken for markup above.
    # The function is called fully qualified because it is only imported
    # into this package once init() has run.
    $html = HTML::Entities::decode_entities($html);
    return $html;
}

1;
