させざきの日記

 | 

2007-12-02

<?php
/**
 * Demo script: runs Diggin_Scraper_Simple against a canned HTML page and
 * prints the page title plus the URLs returned by discovery() for two
 * different base URLs.
 *
 * The Diggin_Scraper_Simple class itself is posted in the previous day's entry.
 * @see http://moshican.g.hatena.ne.jp/sasezaki/20071201
 */

// All libraries are loaded relative to include_path. A leading "/" would make
// the path absolute (filesystem root) and bypass include_path entirely.
require_once 'Diggin/Scraper/Simple.php';
require_once 'Zend/Http/Client.php';
require_once 'Zend/Http/Client/Adapter/Test.php';

// Test adapter: the HTTP client is handed the canned response set below.
$adapter = new Zend_Http_Client_Adapter_Test();

// Fixture page. The <link> and <img> elements cover absolute URLs,
// root-relative paths ("/rss"), query-only paths ("/?rss"), parent-relative
// paths ("../rss") and bare file names ("rss.xml").
$source = <<<EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>Twitter / aaa

aa</title>
<title>Twi</title>
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="http://twitter.com/statuses/friends_timeline/xxxxx.rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="/rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="/?rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="../rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="rss.xml" />  
</head>/rss
<body>
<img id="id" src="/images/button.gif" alt="gazou" />
<img id="id" src="../images/button.gif" alt="gazou" />
<img id="id" src="button.gif" alt="gazou" />
</body>
</html>
EOF;

// Raw HTTP response: status line, one header, blank line, then the fixture body.
$adapter->setResponse(
    "HTTP/1.1 200 OK"        . "\r\n" .
    "Content-type: text/xml" . "\r\n" .
                               "\r\n" .
    $source);

// Two base URLs: a file directly under the root, and a deeper path that also
// carries a query string and a fragment.
$url = "http://www.example.org/hoge.do";
$url2 = "http://www.example.org/hoge/do.php/test?test#test";
$client = new Zend_Http_Client($url, array('adapter' => $adapter));

$scraper = new Diggin_Scraper_Simple();
$scraper->setHttpClient($client);
//$scraper->scrape('head/link[@type="application/rss+xml"]');

// Title of the fixture page, then discovery() with its default arguments
// while the base URL is $url.
print_r($scraper->getTitle());
print_r($scraper->discovery());

// Switch the base URL to $url2 and run the same discovery again.
$scraper->setUrl($url2);
print_r($scraper->discovery());

// Same call with an explicit XPath expression and attribute name: collects
// the "src" attribute of every <img> (base URL is still $url2).
print_r($scraper->discovery("//img","src"));

結果

Twitter / aaa aa

Array

(

[0] => http://twitter.com/statuses/friends_timeline/xxxxx.rss

[1] => http://www.example.org/rss

[2] => http://www.example.org/?rss

[4] => http://www.example.org/rss.xml

)

Array

(

[0] => http://twitter.com/statuses/friends_timeline/xxxxx.rss

[1] => http://www.example.org/rss

[2] => http://www.example.org/?rss

[3] => http://www.example.org/hoge/rss

[4] => http://www.example.org/hoge/do.php/rss.xml

)

Array

(

[0] => http://www.example.org/images/button.gif

[1] => http://www.example.org/hoge/images/button.gif

[2] => http://www.example.org/hoge/do.php/button.gif

)

 |