Wednesday, January 16, 2013

PDFTribute.net PDF Link Scraper - Command Line PHP Tool

[sourcecode language="php"]
function unshorten_url($url) {
$ch = curl_init($url);
curl_setopt_array($ch, array(
CURLOPT_FOLLOWLOCATION => TRUE,
CURLOPT_RETURNTRANSFER => TRUE,
CURLOPT_SSL_VERIFYHOST => FALSE,
CURLOPT_SSL_VERIFYPEER => FALSE,
));
curl_exec($ch);
$url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
curl_close($ch);
return $url;
}

$target_url = "http://pdftribute.net/"; // Simply add page number for a specific page ie pdftribute.net/3
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html= curl_exec($ch);

$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
$urls = array();
$pdfs = array();

for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
$urls[] = $url;
}

foreach ($urls as $url)
if (substr(unshorten_url($url), -3, 3) == 'pdf') $pdfs[] = unshorten_url($url);

print_r($pdfs);
[/sourcecode]

No comments:

Post a Comment