どうも完成度がイマイチ(っていうか、まだどこまで使えるかわからん)なのでリリースするかどうか怪しいモジュール。この間読んだこれ、
http://blog.zuzara.com/2006/06/06/84/
もう少し親divとかにかぶらないようにできないかなぁ、と思ってちょっとほげほげしてみたのが以下のモジュール:
package HTML::ContentScraper; use strict; use HTML::TreeBuilder;sub new
{
my $class = shift;
my $self = bless {
threshold_length => 200,
threshold_tag_rate => 0.05
}, $class;
return $self;
}sub _elem
{
my $self = shift;
my $field = shift;
my $ret = $self->{$field};
if (@_) {
$self->{$field} = shift;
}
return $ret;
}
sub threshold_length { shift->_elem('threshold_length', @_) }
sub threshold_tag_rate { shift->_elem('threshold_tag_rate', @_) }
sub scrape_uri
{
my $self = shift;
my $uri = shift;require LWP::Simple;
$self->scrape_content(LWP::Simple::get($uri));
}sub scrape_content
{
my $self = shift;my $html = HTML::TreeBuilder->new_from_content($_[0]);
my @elems;
# Look for sections that are either contained in a TD or DIV
foreach my $candidate ($html->look_down(_tag => qr{^(?:td|div)$})) {
# in each section, look for the number of tags
my @tags = $candidate->look_down(_tag => qr(.));
my $text = $candidate->as_text;
my $len = length($text);
if ($len >= $self->{threshold_length}) {
if (scalar(@tags) / $len < $self->{threshold_tag_rate}) {
# make sure that this new element is not inside a previous
# container. if this happens, then we should pick the
# new element and toss the old one
my @l = $candidate->lineage;
my $f = 0;
for (my $i = 0; $i < @elems; $i++) {
foreach my $l (@l) {
if ($l->idf eq $elems[$i]->idf) {
splice(@elems, $i, 1);
$f = 1;
last;
}
}
last if $f;
}push @elems, $candidate;
}
}
}my @ret;
foreach my $e (@elems) { push @ret, $e->clone }
$html->delete;
return wantarray ? @ret : \@ret;
}1;
で、テストスクリプト
#!/usr/bin/perluse strict;
use lib("lib");
use HTML::ContentScraper;my $scraper = HTML::ContentScraper->new;
foreach my $uri (@ARGV) {
my @elems = $scraper->scrape_uri($uri);
print "HTML contents: ", scalar(@elems), "\n";
foreach my $e (@elems) {
print "=== START ===\n", $e->as_text, "=== END ===\n";
}
}
で、問題はこれをどう使うかだなぁ。いいアプリケーションあるかしらん。ちなみにMeCabをこれにかますのもどうかなぁ、とか思ってたり。