ixnode/php-web-crawler
最新稳定版本:0.1.24
Composer 安装命令:
composer require ixnode/php-web-crawler
包简介
PHP Web Crawler - This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect data from it.
README 文档
README
This PHP class allows you to recursively crawl a given HTML page (or a given HTML file) and collect data from it. Simply define the URL (or an HTML file) and a set of XPath expressions that are mapped to the fields of the output data object. The final representation is a PHP array which can easily be converted into JSON for further processing.
1. Installation
composer require ixnode/php-web-crawler
vendor/bin/php-web-crawler -V
php-web-crawler 0.1.0 (02-24-2024 14:46:26) - Björn Hempel <bjoern@hempel.li>
2. Usage
2.1 PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Value\Text; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <h1>Test Title</h1> <p>Test Paragraph</p> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('version', new Text('1.0.0')), new Field('title', new XpathTextNode('//h1')), new Field('paragraph', new XpathTextNode('//p')) ); $html->parse()->getJsonStringFormatted(); // See below
2.2 JSON result
{
"version": "1.0.0",
"title": "Test Title",
"paragraph": "Test Paragraph"
}
3. Advanced usage
3.1 Group
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'content', new Group( 'header', new Field('h1', new XpathTextNode('/html/body//h1')), ), new Group( 'text', new Field('p1', new XpathTextNode('/html/body//p[@class="paragraph-1"]')), new Field('p2', new XpathTextNode('/html/body//p[@class="paragraph-2"]')), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{
"title": "Test Page",
"content": {
"header": {
"h1": "Test Title"
},
"text": {
"p1": "Test Paragraph 1",
"p2": "Test Paragraph 2"
}
}
}
3.2 XpathSection
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSection; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'content', new XpathSection( '/html/body//div[@class="content"]', new Group( 'header', new Field('h1', new XpathTextNode('./h1')), ), new Group( 'text', new Field('p1', new XpathTextNode('./p[@class="paragraph-1"]')), new Field('p2', new XpathTextNode('./p[@class="paragraph-2"]')), ) ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{
"title": "Test Page",
"content": {
"header": {
"h1": "Test Title"
},
"text": {
"p1": "Test Paragraph 1",
"p2": "Test Paragraph 2"
}
}
}
3.3 XpathSections (flat)
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSections; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> <ul> <li>Test Item 1</li> <li>Test Item 2</li> </ul> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'hits', new XpathSections( '/html/body//div[@class="content"]/ul', new XpathTextNode('./li/text()'), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{
"title": "Test Page",
"hits": [
[
"Test Item 1",
"Test Item 2"
]
]
}
3.4 XpathSections (structured)
PHP Code
use Ixnode\PhpWebCrawler\Output\Field; use Ixnode\PhpWebCrawler\Output\Group; use Ixnode\PhpWebCrawler\Source\Raw; use Ixnode\PhpWebCrawler\Source\XpathSections; use Ixnode\PhpWebCrawler\Value\XpathTextNode; $rawHtml = <<<HTML <html> <head> <title>Test Page</title> </head> <body> <div class="content"> <h1>Test Title</h1> <p class="paragraph-1">Test Paragraph 1</p> <p class="paragraph-2">Test Paragraph 2</p> <table> <tbody> <tr> <th>Caption 1</th> <td>Cell 1</td> </tr> <tr> <th>Caption 2</th> <td>Cell 2</td> </tr> </tbody> </table> </div> </body> </html> HTML; $html = new Raw( $rawHtml, new Field('title', new XpathTextNode('/html/head/title')), new Group( 'hits', new XpathSections( '/html/body//div[@class="content"]/table/tbody/tr', new Field('caption', new XpathTextNode('./th/text()')), new Field('content', new XpathTextNode('./td/text()')), ) ) ); $html->parse()->getJsonStringFormatted(); // See below
JSON result
{
"title": "Test Page",
"hits": [
{
"caption": "Caption 1",
"content": "Cell 1"
},
{
"caption": "Caption 2",
"content": "Cell 2"
}
]
}
4. More examples
- examples/converter.php
- examples/group.php
- examples/section.php
- examples/sections-recursive-url.php
- examples/sections.php
- examples/simple-wiki-page.php
5. Development
git clone git@github.com:ixnode/php-web-crawler.git && cd php-web-crawler
composer install
composer test
6. License
This library is licensed under the MIT License - see the LICENSE.md file for details.
统计信息
- 总下载量: 45
- 月度下载量: 0
- 日度下载量: 0
- 收藏数: 2
- 点击次数: 2
- 依赖项目数: 0
- 推荐数: 0
其他信息
- 授权协议: MIT
- 更新时间: 2024-02-24