Changeset 2423
- Timestamp:
- 02/02/08 14:23:25
- Files:
-
- Web-Scraper/branches/libxml/TODO (added)
- Web-Scraper/branches/libxml/lib/Web/Scraper.pm (modified) (8 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
Web-Scraper/branches/libxml/lib/Web/Scraper.pm
r2401 r2423 38 38 } 39 39 40 our $UseLibxml = 0; 41 42 sub use_libxml { 43 my $self = shift; 44 $UseLibxml = shift if @_; 45 $UseLibxml; 46 } 47 48 sub XML::LibXML::Element::attr { 49 my $self = shift; 50 $self->getAttribute(shift); 51 } 52 40 53 sub define { 41 54 my($class, $coderef) = @_; … … 51 64 my $self = shift; 52 65 my($stuff, $current) = @_; 53 54 66 my($html, $tree); 55 67 … … 74 86 } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) { 75 87 $tree = $stuff->clone; 88 } elsif (blessed($stuff) && $stuff->isa('XML::LibXML::Element')) { 89 $html = $stuff->toString; 76 90 } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') { 77 91 $html = $$stuff; … … 80 94 } 81 95 96 if ($self->use_libxml) { 97 eval { require XML::LibXML; }; 98 $self->use_libxml(0) if ($@); 99 } 100 82 101 $tree ||= do { 83 my $t = HTML::TreeBuilder::XPath->new; 84 $t->parse($html); 85 $t; 102 if ($self->use_libxml) { 103 my $parser = XML::LibXML->new(); 104 $parser->recover(1); 105 $parser->recover_silently(1); 106 $parser->keep_blanks(0); 107 $parser->expand_entities(1); 108 my $dom = $parser->parse_html_string($html); 109 $dom; 110 } else { 111 my $t = HTML::TreeBuilder::XPath->new; 112 $t->parse($html); 113 $t; 114 } 86 115 }; 87 116 … … 108 137 109 138 my $ret = $self->{code}->($tree); 110 $tree->delete; 139 unless ($self->use_libxml) { 140 $tree->delete; 141 } 111 142 112 143 # check user specified return value … … 165 196 return $val->scrape($node, $uri); 166 197 } elsif ($val =~ s!^@!!) { 167 my $value = $node->attr($val); 198 my $value = $UseLibxml ? $node->getAttribute(lc($val)) : $node->attr($val); 168 199 if ($uri && is_link_element($node, $val)) { 169 200 require URI; … … 172 203 return $value; 173 204 } elsif (lc($val) eq 'content' || lc($val) eq 'text') { 174 return $node->isTextNode ? $node->string_value : $node->as_text; 205 if ($UseLibxml) { 206 return $node->textContent; 207 } else { 208 return $node->isTextNode ? 
$node->string_value : $node->as_text; 209 } 175 210 } elsif (lc($val) eq 'raw' || lc($val) eq 'html') { 176 if ($node->isTextNode) { 177 if ($HTML::TreeBuilder::XPath::VERSION < 0.09) { 178 return HTML::Entities::encode($node->as_XML, q("'<>&)); 179 } else { 180 return $node->as_XML; 211 my $html; 212 unless ($UseLibxml) { 213 if ($node->isTextNode) { 214 if ($HTML::TreeBuilder::XPath::VERSION < 0.09) { 215 return HTML::Entities::encode($node->as_XML, q("'<>&)); 216 } else { 217 return $node->as_XML; 218 } 181 219 } 182 } 183 my $html = $node->as_XML; 220 $html = $node->as_XML; 221 } else { 222 $html = $node->toString; 223 } 184 224 $html =~ s!^<.*?>!!; 185 225 $html =~ s!\s*</\w+>\n*$!!; … … 245 285 sub is_link_element { 246 286 my($node, $attr) = @_; 247 my $link_elements = $HTML::Tagset::linkElements{$node->tag} || []; 287 my $tag; 288 if (blessed($node) && $node->isa('XML::LibXML::Element')) { 289 $tag = $node->localname; 290 } else { 291 $tag = $node->tag; 292 } 293 my $link_elements = $HTML::Tagset::linkElements{$tag} || []; 248 294 for my $elem (@$link_elements) { 249 295 return 1 if $attr eq $elem;
