させざきの日記

2007-12-22

SimpleXMLのXpathにおけるstarts-withの扱い

http://d.hatena.ne.jp/shimooka/20071220/1198124082

http://moshican.g.hatena.ne.jp/sasezaki/20071220

コメント部分の続きですが、


SimpleXMLのXpathにおけるstarts-withの扱い

がうまくいくかという検証でいいんでしょうか。。。


http://diggin.musicrider.com/tests/startsWith.php

http://diggin.musicrider.com/tests/startsWith.phps

http://diggin.musicrider.com/tests/doc.xml

xrea(php 5.1.6)でもうまく行くようですが。。。

まったく別に問題点があるんでしょうか?

スイマセン、他に原因と思われる箇所が浮かびません。。

shimookashimooka2007/12/22 16:53わざわざありがとうございます。
今やると動く。うー。。。typoだった可能性が。。。>< すみません!

2007-12-20

MyMiniCity部のスクレイピングがブーム?

PHPで街を育てる - Do You PHP はてな

http://d.hatena.ne.jp/shimooka/20071220/1198124082

XPathを使っているなら、この場合はpreg_matchを使わずに

foreach ($xml->xpath('//div[@class="section"]//td[2]/a[starts-with(.,"http://")]') as $element) {
    $url = $element["href"];
    echo "request : $url ... "."\n";

で、いいような気がしなくもないかもしれない

てか、HTMLSax3ってPHP4かあ。Strictエラーがバカスカでてきたので気づいた。



[追記]あ、元のRuby版がそういう抽出方法だからですね

http://kawadash.in/junzou/scripts/myminicitygrower.rb

shimookashimooka2007/12/21 15:31tbありがとうございます。
ああ、そういう手がありましたか。すでに忘却の彼方。。。w

sasezakisasezaki2007/12/21 22:03せせこましいツッコミでどうもスイマセン

shimookashimooka2007/12/22 03:37いえいえ。貴重な情報をありがとうございました。
で、試してみたんですが、xpathのparseに失敗しちゃいました。。。orz
SimpleXMLで対応してないんですかねぇ。ちょっともったいない。

sasezakisasezaki2007/12/22 04:10あれ?自分の環境だとうまく行ったんですが。。
(WindowsXP + PHP5.2.4)
ちょっと後で出来たら確認します。

2007-12-18

コマンドライン(php)で対話形式で萌え情報をゲットする

PHP版をつくる試みその1


<?php
require_once 'Zend/Http/Client.php';
require_once 'Zend/Validate/LessThan.php';

/**
 * Small helper for interactive command-line prompts.
 *
 * Prompts are written to standard output (converted from UTF-8 to SJIS)
 * and answers are read line by line from STDIN.
 */
class Diggin_Console_Interactive
{
    /**
     * Show a numbered list taken from an XML document and let the user pick one.
     *
     * The nodes matched by $xpathValue are printed as "index:text" lines. The
     * user types an index, and the text of the node at the same position in
     * the $xpathQuery result is returned. The prompt is repeated until a
     * usable index is entered.
     *
     * An empty string is returned when there is nothing to choose from (an
     * XPath expression fails or matches nothing) or when STDIN reaches
     * end-of-file.
     *
     * @param string $xmlStr      XML document to read the choices from
     * @param string $xpathValue  XPath of the nodes displayed as choices
     * @param string $xpathQuery  XPath of the nodes whose text is returned
     * @param string $showMessage prompt text (UTF-8)
     * @return string
     */
    public function select($xmlStr, $xpathValue, $xpathQuery, $showMessage = "選択してください")
    {
        $iterator = new SimpleXMLIterator($xmlStr);

        // xpath() reports failure with a non-array value; treat that the
        // same way as "no matches".
        $labels = $iterator->xpath($xpathValue);
        $values = $iterator->xpath($xpathQuery);
        if (!is_array($labels) || !is_array($values) || count($labels) === 0) {
            return '';
        }

        foreach ($labels as $index => $label) {
            $this->_say($index . ":" . (string) $label . "\n");
        }

        $validator = new Zend_Validate_LessThan(count($labels));

        while (true) {
            $this->_say($showMessage . " ");

            $line = fgets(STDIN);
            if ($line === false) {
                // End of input: stop asking instead of looping forever.
                return '';
            }
            $input = trim($line);

            // The LessThan validator alone would also accept negative
            // numbers, empty input and non-numeric text, none of which is
            // a usable array index.
            if (!ctype_digit($input)) {
                echo "'$input' is not a valid item number\n";
                continue;
            }

            if (!$validator->isValid($input)) {
                foreach ($validator->getMessages() as $message) {
                    echo "$message\n";
                }
                continue;
            }

            $index = (int) $input;
            if (!isset($values[$index])) {
                // The value list is shorter than the label list.
                echo "No value is available for item '$input'\n";
                continue;
            }

            return (string) $values[$index];
        }
    }

    /**
     * Ask a yes/no question and wait for "y" or "n" (case-insensitive).
     *
     * Any other answer repeats the prompt. End-of-file on STDIN is treated
     * as "no" so that the loop always terminates.
     *
     * @param string $showMessage prompt text (UTF-8)
     * @return boolean TRUE for "y", FALSE for "n" or end of input
     */
    public function yesNo($showMessage)
    {
        while (true) {
            $this->_say($showMessage . " ");

            $line = fgets(STDIN);
            if ($line === false) {
                return false;
            }

            $input = strtolower(trim($line));
            if ($input === 'y') {
                return true;
            }
            if ($input === 'n') {
                return false;
            }
        }
    }

    /**
     * Write UTF-8 text to standard output, converted to SJIS.
     *
     * @param string $text
     * @return void
     */
    protected function _say($text)
    {
        echo mb_convert_encoding($text, 'SJIS', 'utf8');
    }
}

// Interactive driver: builds a query for the moeten.info API step by step.
// For each filter the API is first asked for the list of possible values
// (parameter = "list"); the user picks one and the parameter is replaced by
// the chosen key before moving on to the next filter.
$client      = new Zend_Http_Client();
$interactive = new Diggin_Console_Interactive();
$moeten      = 'http://moeten.info/maidcafe/';
$query       = array("m" => "api");

// One row per filter: query parameter, yes/no question (null = always ask
// for a value without confirmation), XPath of the labels shown to the user,
// XPath of the keys sent back to the API.
// The "type" filter has no question because its confirmation prompt
// ("情報のタイプを指定しますか (y/N)") is disabled.
$steps = array(
    array("type", null, '/channel/item/type', '/channel/item/keytype'),
    array("tid", "都道府県名を指定しますか (y/N)", '/channel/item/todouhuken', '/channel/item/tid'),
    array("cid", "カテゴリを指定しますか (y/N)", '/channel/item/category', '/channel/item/cid'),
    array("sid", "ショップを指定しますか (y/N)", '/channel/item/shopname', '/channel/item/sid'),
);

foreach ($steps as $step) {
    list($param, $question, $labelXpath, $valueXpath) = $step;

    if ($question !== null && !$interactive->yesNo($question)) {
        continue;
    }

    // Fetch the list of candidates for this filter ...
    $query[$param] = "list";
    $client->setUri($moeten . "?" . http_build_query($query));
    $listing = $client->request()->getBody();

    // ... and narrow the query down to the entry the user selected.
    $query[$param] = $interactive->select($listing, $labelXpath, $valueXpath);
    $client->setUri($moeten . "?" . http_build_query($query));
}

// Show the final URI and the API response for it.
print_r($client->getUri());
print_r($client->request()->getBody());

2007-12-02

<?php
// Demo script: feeds a canned XHTML document to Diggin_Scraper_Simple and
// prints the page title, the discovered feed links for two different base
// URLs, and the discovered image URLs.
//
// The Diggin_Scraper_Simple class is posted in the previous day's entry.
//@see http://moshican.g.hatena.ne.jp/sasezaki/20071201
// NOTE(review): the leading slash makes the first path absolute, unlike the
// two include_path-relative requires below it -- confirm this is intended.
require_once '/Diggin/Scraper/Simple.php';
require_once 'Zend/Http/Client.php';
require_once 'Zend/Http/Client/Adapter/Test.php';

// Adapter that is given a fixed response below (presumably so that no real
// network request is made -- confirm against the Zend test adapter docs).
$adapter = new Zend_Http_Client_Adapter_Test();
// Fixture document: <link> and <img> elements using absolute, root-relative,
// query-only, parent-relative and plain relative references.
$source = <<<EOF
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>Twitter / aaa

aa</title>
<title>Twi</title>
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="http://twitter.com/statuses/friends_timeline/xxxxx.rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="/rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="/?rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="../rss" />
 <link rel="alternate" type="application/rss+xml" title="hogehoge(RSS)" href="rss.xml" />  
</head>/rss
<body>
<img id="id" src="/images/button.gif" alt="gazou" />
<img id="id" src="../images/button.gif" alt="gazou" />
<img id="id" src="button.gif" alt="gazou" />
</body>
</html>
EOF;

// Raw HTTP response (status line, one header, blank line, body) handed to
// the adapter.
$adapter->setResponse(
    "HTTP/1.1 200 OK"        . "\r\n" .
    "Content-type: text/xml" . "\r\n" .
                               "\r\n" .
    $source);
// Two base URLs: a file directly under the root, and a deeper path that
// also carries a query string and a fragment.
$url = "http://www.example.org/hoge.do";
$url2 = "http://www.example.org/hoge/do.php/test?test#test";
$client = new Zend_Http_Client($url, array('adapter' => $adapter));
// $client2 is created but not used anywhere below.
$client2 = new Zend_Http_Client($url2, array('adapter' => $adapter));

$scraper = new Diggin_Scraper_Simple();
// Hand the scraper the client that is bound to $url.
$scraper->setHttpClient($client);
//$scraper->scrape('head/link[@type="application/rss+xml"]');
print_r($scraper->getTitle());
// Feed links with the default arguments, resolved while the client URI is $url.
print_r($scraper->discovery());
// Switch to the second base URL and run the same discovery again.
$scraper->setUrl($url2);
print_r($scraper->discovery());

// Same mechanism for the src attribute of every <img> element.
print_r($scraper->discovery("//img","src"));

結果

Twitter / aaa aa

Array

(

[0] => http://twitter.com/statuses/friends_timeline/xxxxx.rss

[1] => http://www.example.org/rss

[2] => http://www.example.org/?rss

[4] => http://www.example.org/rss.xml

)

Array

(

[0] => http://twitter.com/statuses/friends_timeline/xxxxx.rss

[1] => http://www.example.org/rss

[2] => http://www.example.org/?rss

[3] => http://www.example.org/hoge/rss

[4] => http://www.example.org/hoge/do.php/rss.xml

)

Array

(

[0] => http://www.example.org/images/button.gif

[1] => http://www.example.org/hoge/images/button.gif

[2] => http://www.example.org/hoge/do.php/button.gif

)

2007-12-01

<?php
/**
 * Minimal HTML scraper.
 *
 * Fetches a page through a shared Zend_Http_Client, repairs the markup with
 * tidy, loads it into SimpleXML and offers XPath based helpers on top.
 */
class Diggin_Scraper_Simple
{
    /**
     * Client used by the most recent makeRequest() call
     *
     * @var Zend_Http_Client
     */
    protected $_client;

    /**
     * URL to fetch; when empty the URI already set on the client is used
     *
     * @var string|null
     */
    protected $_url;

    /**
     * TRUE when the URL was changed through setUrl() and has not been
     * fetched yet
     *
     * @var boolean
     */
    protected $_resetUrlFlg = FALSE;

    /**
     * GET parameters passed to the constructor
     *
     * @var array
     */
    protected $_parms = array();

    /**
     * HTTP client object to use for retrieving
     *
     * @var Zend_Http_Client
     */
    protected static $_httpClient = null;

    /**
     * Override HTTP PUT and DELETE request methods?
     *
     * Not read anywhere in this class.
     *
     * @var boolean
     */
    protected static $_httpMethodOverride = false;

    /**
     * Get the URL that will be fetched.
     *
     * @return string|null
     */
    public function getUrl()
    {
        return $this->_url;
    }

    /**
     * Set a new URL and force the next makeRequest() to fetch it.
     *
     * @param string $url
     * @return void
     */
    public function setUrl($url)
    {
        $this->_resetUrlFlg = TRUE;
        $this->_url = $url;
    }

    /**
     * Set the HTTP client instance shared by all scrapers.
     *
     * @param  Zend_Http_Client $httpClient
     * @return void
     */
    public static function setHttpClient(Zend_Http_Client $httpClient)
    {
        self::$_httpClient = $httpClient;
    }

    /**
     * Gets the HTTP client object. If none is set, a new Zend_Http_Client will be used.
     *
     * @return Zend_Http_Client
     */
    public static function getHttpClient()
    {
        if (!self::$_httpClient instanceof Zend_Http_Client) {
            /**
             * @see Zend_Http_Client
             */
            require_once 'Zend/Http/Client.php';
            self::$_httpClient = new Zend_Http_Client();
        }

        return self::$_httpClient;
    }

    /**
     * Constructor.
     *
     * @param string|null $url   URL to fetch (optional)
     * @param array       $parms GET parameters sent with the request
     */
    public function __construct($url = null, array $parms = array())
    {
        $this->_url = $url;
        // Keep the parameters so that makeRequest() can apply them.
        $this->_parms = $parms;
    }

    /**
     * Fetch the page (or reuse the client's last response) and parse it.
     *
     * A new GET request is sent when the client has no previous response or
     * when the URL was changed through setUrl(); otherwise the last response
     * of the client is reused. The body is cleaned up by tidy and returned
     * as a SimpleXMLElement.
     *
     * @return SimpleXMLElement
     * @throws Diggin_Scraper_Exception when a newly sent request is not successful
     */
    public function makeRequest()
    {
        $this->_client = self::getHttpClient();

        if ($this->_url) {
            $this->_client->setUri($this->getUrl());
        }

        if (!empty($this->_parms)) {
            $this->_client->setParameterGet($this->_parms);
        }

        $response = $this->_client->getLastResponse();
        if (!$response || $this->_resetUrlFlg) {
            $response = $this->_client->request('GET');

            if (!$response->isSuccessful()) {
                /**
                 * @see Diggin_Scraper_Exception
                 */
                require_once 'Diggin/Scraper/Exception.php';
                throw new Diggin_Scraper_Exception("Http client reported an error: '{$response->getMessage()}'");
            }

            // The new URL has been fetched; later calls may reuse the response.
            $this->_resetUrlFlg = FALSE;
        }

        $config = array(
            'indent'           => false,
            'add-xml-decl'     => true,
            'output-xml'       => true,
            'numeric-entities' => true,
            'wrap'             => 200,
        );

        $tidy = new tidy;
        $tidy->parseString($response->getBody(), $config, 'utf8');
        $tidy->cleanRepair();

        return new SimpleXMLElement($tidy->value);
    }

    /**
     * Run an XPath query against the parsed page.
     *
     * @param string $xpath XPath expression
     * @return array list of SimpleXMLElement nodes, empty when nothing
     *               matches or the expression cannot be evaluated
     */
    public function scrape($xpath)
    {
        $xml = $this->makeRequest();

        $nodes = $xml->xpath($xpath);
        if (!is_array($nodes)) {
            return array();
        }

        return array_values($nodes);
    }

    /**
     * Discover the "real" (absolute) URLs found in an attribute of the
     * nodes matched by an XPath expression.
     *
     * Duplicates are removed with array_unique(), which keeps the original
     * keys, so the returned array may have gaps in its numeric indexes.
     *
     * @param string $xpath     XPath expression selecting the nodes
     * @param string $attribute attribute holding the URL
     * @return array
     */
    public function discovery($xpath = 'head/link[@type="application/rss+xml"]', $attribute = "href")
    {
        $replaces = array();
        foreach ($this->getAttribute($xpath, $attribute) as $getAttribute) {
            $replaces[] = $this->getRealUrl($getAttribute);
        }

        return array_unique($replaces);
    }

    /**
     * Getting "Real" URL not URI
     * Replace HTML'S relative path
     *
     * A reference that already contains a host is returned unchanged. Any
     * other reference is combined with the URI currently set on the shared
     * HTTP client; query string and fragment of that URI are dropped.
     *
     * using pecl_http
     * @see http://pecl4win.php.net/
     *
     * @param string $href URL reference as found in the document
     * @return string
     */
    public function getRealUrl($href)
    {
        $parse = parse_url($href);
        if (isset($parse["host"])) {
            return $href;
        }

        $uri = self::getHttpClient()->getUri(TRUE);
        $uridir = pathinfo(parse_url($uri, PHP_URL_PATH), PATHINFO_DIRNAME);

        $flags = HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT;
        // NOTE(review): the path is only joined with the base path when the
        // directory part contains a forward slash; presumably this covers
        // cases where pathinfo() reports something like "" or "\" for the
        // base URI -- confirm.
        if (strpos($uridir, '/') !== false) {
            $flags = HTTP_URL_JOIN_PATH | $flags;
        }

        return http_build_url($uri, array("path" => $href), $flags);
    }

    /**
     * Get the value of one attribute from every node matched by an XPath
     * expression.
     *
     * @param string $xpath     XPath expression selecting the nodes
     * @param string $attribute attribute name
     * @return array list of strings
     */
    public function getAttribute($xpath, $attribute)
    {
        $getAttributes = array();

        foreach ($this->scrape($xpath) as $result) {
            $getAttributes[] = (string) $result[$attribute];
        }

        return $getAttributes;
    }

    /**
     * Getting head->title tag's value
     *
     * @return string trimmed text of the first title element, or an empty
     *                string when the page has none
     */
    public function getTitle()
    {
        $results = $this->scrape('head/title');
        if (!isset($results[0])) {
            return '';
        }

        return trim((string) $results[0]);
    }
}