Screen scraping with Perl, HTML::TreeBuilder

perl — Tags: , — nik @ 6:41 pm

I put together a simple screen scraper using HTML::TreeBuilder in Perl. I am new to Perl and had a tricky time getting the code available online to do what I wanted, so after a bunch of trial and error I ended up with this. The script gathers up to 1,000 results for “Doogie Howser, M.D.” from Google (ten pages of 100 results each) and then parses out the title, link, and summary for each result. As you can see, I used HTML::TreeBuilder to pick out elements in the HTML by their CSS class.

#!/usr/bin/perl

#screenscraping with HTML::TreeBuilder

use strict;
use warnings;
use LWP;
use HTML::TreeBuilder;

#these three are shared with the parsing and printing code further down the
#script, so they are declared here at file scope. $content starts out as an
#empty string so it is always defined, even if every request fails.
my ($content, $search_result, @search_results);
$content = '';

my $max_result   = 1000;	#stop once this many results have been requested
my $start        = 0;		#offset of the first result on the next page
my $max_failures = 3;		#consecutive failed requests tolerated
my $failures     = 0;

#Google for "Doogie Howser, M.D.", 100 results per page.
my $url = 'http://www.google.com/search?hl=en&q=Doogie+Howser%2C+M.D.&aq=f&oq=&aqi=&num=100';

#timeout and agent string never change, so set them once at construction
#instead of on every pass through the loop
my $user_agent = LWP::UserAgent->new(timeout => 30, agent => 'Mozilla/5.0');

print "Searching...this may take a minute\n";
while ( $start < $max_result ) {
	my $request  = HTTP::Request->new(GET => $url."&start=$start");
	my $response = $user_agent->request($request);

	if ($response->is_success) {
		$content .= $response->content;
		$start += 100;
		$failures = 0;
		next;
	}

	#a failed request does not advance $start, so without a limit a
	#persistent error (timeout, 4xx/5xx) would retry the same page forever
	$failures++;
	warn "Request for results starting at $start failed: "
		.$response->status_line."\n";
	if ($failures >= $max_failures) {
		warn "Giving up after $max_failures consecutive failures\n";
		last;
	}
	sleep 1;	#brief pause before retrying the same page
}

#parse $content with treebuilder (an empty document if nothing was fetched)
my $page = HTML::TreeBuilder->new();
$page->parse(defined $content ? $content : '');
$page->eof();

#the following code finds every <li> item with class 'g' -- right now,
#this is how search results are styled on google. so, the element
#for each item returned by google is stored in @search_results
#
#the class is matched as a whole whitespace-separated word, so
#class="g" and class="g w0" match but class="logo" does not, and <li>
#elements without any class attribute are skipped.
#
#you could add this right after the look_down call:
#foreach $search_result (@search_results){
# print $search_result->as_HTML,"\n\n";
#}
#and see how each item looks in the array.

@search_results = $page->look_down(
	_tag  => 'li',
	class => qr/(?:\A|\s)g(?:\s|\z)/,
);

#builds a pattern that matches a class attribute containing $name as one
#whole whitespace-separated word (so 'r' matches class="r" but not
#class="header")
my $class_token = sub {
	my ($name) = @_;
	return qr/(?:\A|\s)\Q$name\E(?:\s|\z)/;
};

foreach my $result (@search_results){
	my($url, $title, $summary);

	#each result is already an element of the parsed tree, so we search
	#inside it directly instead of turning it back into HTML and parsing
	#that into a new tree (those extra trees were never deleted)

	#the title is styled as <h3 class=r>
	$title = $result->look_down(
		_tag  => 'h3',
		class => $class_token->('r'),
	);
	#the summary is styled as <div class=s>
	$summary = $result->look_down(
		_tag  => 'div',
		class => $class_token->('s'),
	);

	#the link is styled as <a class=l href="..."> inside the title.
	#a result without a title has no link to look for, and calling
	#methods on the missing title would abort the script.
	if($title){
		my $link = $title->look_down(
			_tag  => 'a',
			class => $class_token->('l'),
		);
		$url = $link->attr('href') if $link;
	}

	#print everything out...
	if($title) { print 'title: '.$title->as_text."\n";}
	if($summary){ print 'summary: '.$summary->as_text."\n";}
	if($url){ print 'url: '.$url."\n\n";}
}

#delete the treebuilder object. $page still refers to the tree that holds
#every element in @search_results, so this frees the whole parsed document.
$page->delete;

1 Comment »

  1. nice. excited to use this, although not sure what for yet.

    Comment by nick — July 8, 2009 @ 1:55 pm

RSS feed for comments on this post. TrackBack URI

Leave a comment

This work is licensed under a Creative Commons Attribution-Noncommercial-Share Alike 3.0 Unported License.
(c) 2010 nik’s blog | powered by WordPress with Barecity